Summary: Enables an important performance optimization for ROCm, in light of
the discussion in https://github.com/pytorch/pytorch/issues/41028.

CC jithunnair-amd sunway513

Pull Request resolved: https://github.com/pytorch/pytorch/pull/60607

Reviewed By: jbschlosser

Differential Revision: D29409894

Pulled By: ngimel

fbshipit-source-id: effca258a0f37eaefa35674a7fd19459ca7dc95b
#include <fstream>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/batch_gather_ops.h"
// Shared batch kernel
#include "caffe2/operators/gather_op.cuh"
#include "caffe2/utils/GpuAtomics.cuh"

namespace caffe2 {

template <>
bool BatchGatherOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
}

template <>
template <typename TInd>
bool BatchGatherOp<CUDAContext>::DoRunWithType() {
  // BatchGather is a special-case of Gather with Axis = 1, wrap = false.
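  // E.g., for DATA of shape [B, M, ...] and 1-D INDICES of length N,
  // OUTPUT[b, n, ...] = DATA[b, INDICES[n], ...].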
  return gather_helper::gather_impl_cuda<TInd>(
      this, DATA, INDICES, 0, 1, false, match_outer_);
}

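// Scatter-add kernel for the BatchGather gradient: each thread walks the
// flattened (outer batch, index, inner block) space in a grid-stride loop and
// accumulates gradient slices into the output rows selected by `indices`.
// gpu_atomic_add is used because the same index may appear more than once.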
template <typename T_INDEX, typename TData>
__global__ void BatchGatherGradientKernel(
    const TData* grad_data,
    TData* out,
    const T_INDEX* indices,
    const int outer_dims_product,
    const int N,
    const int data_batch_size,
    const int gathered_batch_size,
    const int block_size,
    const int src_indexing_axis_dim,
    const bool wrap_indices) {
  int begin_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int num_items = outer_dims_product * N * block_size;

  for (int s = begin_idx; s < num_items; s += blockDim.x * gridDim.x) {
    // Decompose the flat index s into (outer batch i, index position j,
    // offset k within the gathered block).
    const int k = s % block_size;
    const int j = s / block_size % N;
    const int i = s / block_size / N;
    T_INDEX idx = indices[j];
    if (wrap_indices && idx < 0) {
      idx = idx + src_indexing_axis_dim;
    }
    const float* src_offset =
        grad_data + i * gathered_batch_size + j * block_size;
    float* dst_offset = out + i * data_batch_size + idx * block_size;
    gpu_atomic_add(dst_offset + k, src_offset[k]);
  }
}

template <>
bool BatchGatherGradientOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
      this, OperatorBase::Input<Tensor>(INDICES, CUDA));
}

template <>
template <typename TInd>
bool BatchGatherGradientOp<CUDAContext>::DoRunWithType() {
  return DispatchHelper<
      TensorTypes2<float, GenericTensorImplementation>,
      TInd>::call(this, OperatorBase::Input<Tensor>(DATA, CUDA));
}

template <>
template <typename TInd, typename TData>
bool BatchGatherGradientOp<CUDAContext>::DoRunWithType2() {
  CAFFE_ENFORCE(
      !match_outer_, "match_outer=true is currently only supported for CPU");

  auto& data = Input(DATA);
  auto& indices = Input(INDICES);
  auto& grad = Input(GRAD);

  // ONNX allows negative axis to index from the back, valid range: [-r, r].
  int axis = axis_;
  if (axis < 0) {
    axis = data.dim() + axis;
  }
  // Outer dimensions of input data and gradient should be the same
  // because they are preserved for gathers with axis > 0.
  for (int acheck = 0; acheck < axis; acheck++) {
    CAFFE_ENFORCE_EQ(
        data.size(acheck), grad.size(acheck), "batch sizes should be the same");
  }

  auto* output = Output(0, data.sizes(), at::dtype<float>());
  auto* out_data = output->template mutable_data<float>();
  math::Set<float, CUDAContext>(output->numel(), 0, out_data, &context_);

  const auto* grad_data = grad.template data<float>();
  const TInd* idxs = indices.template data<TInd>();

  // Treat all outer dimensions as a unit as they contribute to larger batch.
  const int outer_dims_product = grad.size_to_dim(axis);
  const int block_size = data.size_from_dim(axis + 1);

  const int N = indices.numel();
  const auto data_batch_size = data.size_from_dim(axis);
  const auto gathered_batch_size = N * block_size;
  const int src_indexing_axis_dim = data.dim(axis);

  // Assign each thread its own 'float' in block_size * N (the kernel will
  // loop if there is more data than fits the NUM_BLOCKS * NUM_THREADS limit).
  BatchGatherGradientKernel<<<
      std::min(outer_dims_product, CAFFE_MAXIMUM_NUM_BLOCKS),
      std::min(N * block_size, CAFFE_CUDA_NUM_THREADS),
      0,
      context_.cuda_stream()>>>(
      grad_data,
      out_data,
      idxs,
      outer_dims_product,
      N,
      data_batch_size,
      gathered_batch_size,
      block_size,
      src_indexing_axis_dim,
      false); // TBD: Add proper index wrapping support to Gather gradients.
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

template <>
template <typename TInd>
bool BatchGatherGradientOp<CUDAContext>::DoRunWithOtherType2() {
  CAFFE_THROW(
      "BatchGatherGradient is not implemented on tensor of type ",
      Input(DATA).meta().name(),
      "; consider adding it as a type in the DispatchHelper list or"
      " implementing a generic version (which won't work for duplicated"
      " indices, though)");
}

REGISTER_CUDA_OPERATOR(BatchGather, BatchGatherOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(BatchGatherGradient, BatchGatherGradientOp<CUDAContext>);

} // namespace caffe2