Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-07 12:21:27 +01:00
Summary:
This reverts commit d73c830e23.
We have observed a significant perf drop when training ResNext101 with multiple AMD GPUs:
Before:
https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-clang7-rocmdeb-ubuntu16.04-bench/1636/console
2 GPUs ResNext training got 150~160 imgs/sec
4 GPUs ResNext training got 270~280 imgs/sec
After:
https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-clang7-rocmdeb-ubuntu16.04-bench/1637/console
Both 2 and 4 GPU ResNext training drop to 110~120 imgs/sec
A similar perf drop is seen on ResNet50 training jobs as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18680
Differential Revision: D14702941
Pulled By: bddppq
fbshipit-source-id: 828141805afc23f25c08d4a2eb6d4b99f817c128
346 lines
9.6 KiB
Plaintext
#include <cub/cub.cuh>

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pack_segments.h"

namespace caffe2 {

namespace {
template <typename T, typename Data_T>
__global__ void PackSegmentsKernel(
    const Data_T* data_ptr,
    const T* lengths_ptr,
    const T* lengths_cum_sum,
    const T max_length,
    const int64_t num_seq,
    const int64_t cell_size,
    Data_T padding,
    bool* presence_ptr,
    Data_T* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, num_seq * max_length * cell_size) {
    int seq = (i / cell_size) / max_length;
    int cell = (i / cell_size) % max_length;
    int offset = i % cell_size;
    if (presence_ptr && offset == 0) {
      presence_ptr[i / cell_size] = cell < lengths_ptr[seq];
    }
    if (cell >= lengths_ptr[seq]) {
      out_ptr[i] = padding;
    } else {
      int32_t idx = (lengths_cum_sum[seq] + cell) * cell_size + offset;
      out_ptr[i] = data_ptr[idx];
    }
  }
}
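// Illustrative example with hypothetical values: for num_seq = 2,
// lengths = {2, 1}, max_length = 2, cell_size = 3 and padding = 0, the
// flat input
//   data = [a0 a1 a2 | b0 b1 b2 | c0 c1 c2]    (3 rows of 3 elements)
// is packed into the (num_seq x max_length x cell_size) output
//   out  = [a0 a1 a2 | b0 b1 b2]               (sequence 0: rows 0-1)
//          [c0 c1 c2 |  0  0  0]               (sequence 1: row 2, then padding)
// and, if requested, presence = {true, true, true, false}.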
template <typename T, typename Data_T>
__global__ void UnpackSegmentsKernel(
    const Data_T* data_ptr,
    const T* lengths_ptr,
    const T* lengths_cum_sum,
    const T max_length,
    const int64_t num_seq,
    const int64_t cell_size,
    Data_T* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, num_seq * max_length * cell_size) {
    int seq = (i / cell_size) / max_length;
    int cell = (i / cell_size) % max_length;
    int offset = i % cell_size;
    if (cell < lengths_ptr[seq]) {
      int idx = (lengths_cum_sum[seq] + cell) * cell_size + offset;
      out_ptr[idx] = data_ptr[i];
    }
  }
}
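// UnpackSegmentsKernel inverts the layout above: each thread reads one
// element of the padded (num_seq x max_length x cell_size) input and
// scatters it back to its row in the flat output, skipping padding cells
// (cell >= lengths_ptr[seq]).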
template <typename T>
int64_t int_array_sum(
    const T* dev_array,
    int64_t num_items,
    Tensor& dev_buffer,
    Tensor& dev_sum,
    Tensor& host_sum,
    CUDAContext& context) {
  // Retrieve buffer size
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(
      nullptr,
      temp_storage_bytes,
      dev_array,
      dev_sum.mutable_data<int64_t>(),
      num_items,
      context.cuda_stream());

  // Allocate temporary storage
  auto buffer_size = (temp_storage_bytes + sizeof(T)) / sizeof(T);
  dev_buffer.Resize(buffer_size);
  void* dev_temp_storage = static_cast<void*>(dev_buffer.mutable_data<T>());

  // Compute the sum
  cub::DeviceReduce::Sum(
      dev_temp_storage,
      temp_storage_bytes,
      dev_array,
      dev_sum.mutable_data<int64_t>(),
      num_items,
      context.cuda_stream());

  // Copy to host
  host_sum.CopyFrom(dev_sum);
  context.FinishDeviceComputation();
  return *host_sum.data<int64_t>();
}
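// A note on the cub calling convention used in these helpers: each
// cub::DeviceReduce / cub::DeviceScan entry point is invoked twice. The
// first call, with a null d_temp_storage pointer, only writes the scratch
// size it needs into temp_storage_bytes; the caller then allocates that
// buffer and repeats the call to launch the actual reduction or scan.
// The (temp_storage_bytes + sizeof(T)) / sizeof(T) expression rounds up
// so the element-typed buffer holds at least temp_storage_bytes bytes.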
template <typename T>
T array_max(
    const T* dev_array,
    int64_t num_items,
    Tensor& dev_max_buffer,
    Tensor& dev_max,
    Tensor& host_max,
    CUDAContext& context) {
  // Retrieve buffer size
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Max(
      nullptr,
      temp_storage_bytes,
      dev_array,
      dev_max.mutable_data<T>(),
      num_items,
      context.cuda_stream());

  // Allocate temporary storage
  auto buffer_size = (temp_storage_bytes + sizeof(T)) / sizeof(T);
  dev_max_buffer.Resize(buffer_size);
  void* dev_temp_storage = static_cast<void*>(dev_max_buffer.mutable_data<T>());

  // Find maximum
  cub::DeviceReduce::Max(
      dev_temp_storage,
      temp_storage_bytes,
      dev_array,
      dev_max.mutable_data<T>(),
      num_items,
      context.cuda_stream());

  // Copy to host
  host_max.CopyFrom(dev_max);
  context.FinishDeviceComputation();
  return *host_max.data<T>();
}
template <typename T>
void array_prefix_sum_exclusive(
    const T* dev_array,
    const int32_t num_items,
    Tensor& prefix_buffer,
    Tensor& prefix_sum,
    CUDAContext& context) {
  // Retrieve buffer size
  size_t temp_storage_bytes = 0;
  prefix_sum.Resize(num_items);
  cub::DeviceScan::ExclusiveSum(
      nullptr,
      temp_storage_bytes,
      dev_array,
      prefix_sum.mutable_data<T>(),
      num_items,
      context.cuda_stream());

  // Allocate temporary storage
  auto buffer_size = (temp_storage_bytes + sizeof(T)) / sizeof(T);
  prefix_buffer.Resize(buffer_size);
  void* dev_temp_storage = static_cast<void*>(prefix_buffer.mutable_data<T>());

  // Exclusive sum
  cub::DeviceScan::ExclusiveSum(
      dev_temp_storage,
      temp_storage_bytes,
      dev_array,
      prefix_sum.mutable_data<T>(),
      num_items,
      context.cuda_stream());
}
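// Illustrative example with hypothetical values: for lengths = {2, 1, 3}
// the exclusive prefix sum is {0, 2, 3}, i.e. the offset of each
// sequence's first row in the flat, unpadded tensor. This is the
// lengths_cum_sum argument consumed by both kernels above.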
} // namespace
template <>
template <typename T>
bool PackSegmentsOp<CUDAContext>::DoRunWithType() {
  return DispatchHelper<TensorTypes2<char, int32_t, int64_t, float>, T>::call(
      this, Input(DATA));
}
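// DispatchHelper completes a two-level type dispatch: T is the integral
// type of LENGTHS (chosen when RunOnDevice dispatches on that input), and
// TensorTypes2 picks Data_T from the element type of DATA among char,
// int32_t, int64_t and float, landing in DoRunWithType2<T, Data_T>.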
template <>
template <typename T, typename Data_T>
bool PackSegmentsOp<CUDAContext>::DoRunWithType2() {
  const auto& data = Input(DATA);
  const auto& lengths = Input(LENGTHS);
  int64_t num_seq = lengths.dim(0);
  const Data_T* data_ptr = data.data<Data_T>();
  const T* lengths_ptr = lengths.data<T>();
  auto* out = Output(0);
  Tensor* presence_mask = nullptr;
  if (return_presence_mask_) {
    presence_mask = Output(1);
  }

  CAFFE_ENFORCE_GE(data.dim(), 1, "DATA should be at least 1-D");
  CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTH should be 1-D");

  // Find the length of the longest sequence.
  dev_max_length_.Resize(1);
  host_max_length_.Resize(1);

  T temp = num_seq > 0 ? array_max<T>(
                             lengths_ptr,
                             num_seq,
                             dev_buffer_,
                             dev_max_length_,
                             host_max_length_,
                             context_)
                       : 0;
  if (max_length_ != -1) {
    CAFFE_ENFORCE_GE(
        max_length_,
        temp,
        "Pre-defined max_length should be greater than the real max_length");
    temp = max_length_;
  }
  const T& max_length = temp;
  // Compute prefix sum over the lengths
  array_prefix_sum_exclusive<T>(
      lengths_ptr, num_seq, dev_buffer_, dev_lengths_prefix_sum_, context_);
  bool* presence_mask_data = nullptr;
  if (return_presence_mask_) {
    std::vector<int64_t> presence_shape{lengths.numel(), max_length};
    presence_mask->Resize(presence_shape);
    presence_mask_data = presence_mask->template mutable_data<bool>();
  }

  // create output tensor
  auto shape = data.sizes().vec(); // Shape of out is batch_size x max_len x ...
  shape[0] = max_length;
  shape.insert(shape.begin(), lengths.numel());
  out->Resize(shape);
  Data_T* out_ptr = static_cast<Data_T*>(out->raw_mutable_data(data.meta()));

  // Return empty out (with the proper shape) if first dim is 0.
  if (!data.dim(0)) {
    return true;
  }

  // Do padding
  Data_T padding = out->IsType<float>() ? padding_ : 0;
  int64_t cell_size = data.numel() / data.dim(0);
  PackSegmentsKernel<<<
      CAFFE_GET_BLOCKS(num_seq * max_length * cell_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      data_ptr,
      lengths_ptr,
      dev_lengths_prefix_sum_.data<T>(),
      max_length,
      num_seq,
      cell_size,
      padding,
      presence_mask_data,
      out_ptr);

  return true;
}
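// Illustrative shape example with hypothetical values: DATA of shape
// (5, 4) with LENGTHS = {2, 3} packs into OUT of shape (2, 3, 4): two
// sequences padded to max_length 3, each row carrying cell_size = 4
// elements.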
template <>
template <typename T>
bool UnpackSegmentsOp<CUDAContext>::DoRunWithType() {
  return DispatchHelper<TensorTypes2<char, int32_t, int64_t, float>, T>::call(
      this, Input(DATA));
}
template <>
template <typename T, typename Data_T>
bool UnpackSegmentsOp<CUDAContext>::DoRunWithType2() {
  const auto& data = Input(DATA);
  const auto& lengths = Input(LENGTHS);
  int64_t num_seq = lengths.dim(0);
  const Data_T* data_ptr = data.data<Data_T>();
  const T* lengths_ptr = lengths.data<T>();
  auto* out = Output(0);

  CAFFE_ENFORCE_GE(data.dim(), 1, "DATA should be at least 1-D");
  CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTH should be 1-D");
  // Compute prefix sum over the lengths
  array_prefix_sum_exclusive<T>(
      lengths_ptr, num_seq, dev_buffer_, dev_lengths_prefix_sum_, context_);

  // compute max of the lengths
  dev_max_length_.Resize(1);
  host_max_length_.Resize(1);
  T temp = num_seq > 0 ? array_max<T>(
                             lengths_ptr,
                             num_seq,
                             dev_buffer_,
                             dev_max_length_,
                             host_max_length_,
                             context_)
                       : 0;
  if (max_length_ != -1) {
    CAFFE_ENFORCE_EQ(
        max_length_,
        data.dim(1),
        "max_length should be equal to the packed segments");

    CAFFE_ENFORCE_GE(
        max_length_,
        temp,
        "Pre-defined max_length should be greater than the real max_length");

    temp = max_length_;
  }
  const T& max_length = temp;
  // compute num of cells: sum of the lengths
  dev_num_cell_.Resize(1);
  host_num_cell_.Resize(1);
  const int64_t num_cell = int_array_sum<T>(
      lengths_ptr,
      num_seq,
      dev_buffer_,
      dev_num_cell_,
      host_num_cell_,
      context_);

  // create output tensor
  auto shape = data.sizes().vec();
  CAFFE_ENFORCE_EQ(
      shape[0], lengths.dim(0), "LENGTH should match DATA in dimension 0");
  shape.erase(shape.begin());
  shape[0] = num_cell;
  out->Resize(shape);
  Data_T* out_ptr = static_cast<Data_T*>(out->raw_mutable_data(data.meta()));

  // Return empty out (with the proper shape) if any of the dimensions is 0.
  if (data.dim(0) == 0 || data.dim(1) == 0) {
    return true;
  }

  // Unpack
  int64_t cell_size = data.numel() / (data.dim(0) * data.dim(1));
  UnpackSegmentsKernel<<<
      CAFFE_GET_BLOCKS(num_seq * max_length * cell_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      data_ptr,
      lengths_ptr,
      dev_lengths_prefix_sum_.data<T>(),
      max_length,
      num_seq,
      cell_size,
      out_ptr);
  return true;
}
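// Unpacking reverses that shape transform: with hypothetical values, a
// packed (2, 3, 4) DATA with LENGTHS = {2, 3} becomes a flat (5, 4)
// output, where 5 = sum(lengths) is the num_cell computed above.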
REGISTER_CUDA_OPERATOR(UnpackSegments, UnpackSegmentsOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(PackSegments, PackSegmentsOp<CUDAContext>);

} // namespace caffe2
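For reference, the packed layout these kernels implement can be reproduced on the host. The sketch below is a minimal, CUDA-free illustration under the same indexing scheme; the function name pack_segments_cpu and all values are hypothetical, not part of Caffe2:

#include <algorithm>
#include <cstdint>
#include <vector>

// CPU reference: out[seq][cell][offset] equals
// data[(cum_sum[seq] + cell) * cell_size + offset] when cell < lengths[seq],
// and `padding` otherwise.
template <typename T>
std::vector<T> pack_segments_cpu(
    const std::vector<T>& data,
    const std::vector<int64_t>& lengths,
    int64_t cell_size,
    T padding) {
  int64_t max_length = 0;
  int64_t total = 0;
  std::vector<int64_t> cum_sum; // exclusive prefix sum of lengths
  for (int64_t len : lengths) {
    cum_sum.push_back(total);
    total += len;
    max_length = std::max(max_length, len);
  }
  std::vector<T> out(lengths.size() * max_length * cell_size, padding);
  for (size_t seq = 0; seq < lengths.size(); ++seq) {
    for (int64_t cell = 0; cell < lengths[seq]; ++cell) {
      for (int64_t off = 0; off < cell_size; ++off) {
        out[(seq * max_length + cell) * cell_size + off] =
            data[(cum_sum[seq] + cell) * cell_size + off];
      }
    }
  }
  return out;
}

Calling it with lengths {2, 1} and cell_size 3 reproduces the worked example shown after PackSegmentsKernel.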