mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11943 See title Reviewed By: ezyang Differential Revision: D9992645 fbshipit-source-id: e8f80d6ea762971513e5e8072975ceea53e1f11a
108 lines
2.6 KiB
CUDA C++
#include "caffe2/operators/arg_ops.h"
|
|
|
|
#include <limits>
|
|
|
|
#include <cub/block/block_reduce.cuh>
|
|
#include <cub/cub.cuh>
|
|
|
|
#include "caffe2/core/common_gpu.h"
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/utils/fixed_divisor.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
namespace {

// Alias for CUB's key/value pair: the key holds a candidate index along the
// reduced axis and the value holds the corresponding element of X.
template <typename K, typename V>
using KeyValuePair = cub::KeyValuePair<K, V>;

// Block-wide reduction over KeyValuePair items, sized to the fixed thread
// count (CAFFE_CUDA_NUM_THREADS) used for every launch in this file.
template <typename K, typename V>
using BlockReduce =
    cub::BlockReduce<KeyValuePair<K, V>, CAFFE_CUDA_NUM_THREADS>;

// For each of the outer_size flat (i, j) slots, finds the index k in
// [0, inner_size) that wins under `reducer` (e.g. cub::ArgMax / cub::ArgMin)
// over X[i * inner_size * d + k * d + j], and writes that index to Y[idx].
//
// Launch layout: one block per outer slot, grid-striding over outer_size;
// all threads of a block cooperate on the inner (reduced) axis.
// `init` must be the reducer's identity element (e.g. lowest() for ArgMax,
// max() for ArgMin) so threads that scan no elements cannot win.
template <typename T, class Reducer>
__global__ void ComputeArgCUDAKernel(
    const int outer_size,
    const int inner_size,
    const FixedDivisor<int> stride,
    const Reducer reducer,
    const T init,
    const T* X,
    int64_t* Y) {
  __shared__ typename BlockReduce<int, T>::TempStorage temp_storage;
  const int d = stride.d();
  // Grid-stride over blocks: each iteration handles one output slot.
  for (int idx = blockIdx.x; idx < outer_size; idx += gridDim.x) {
    int i;
    int j;
    // Split the flat outer index as idx = i * d + j (i = idx / d, j = idx % d).
    stride.DivMod(idx, &i, &j);
    // Sentinel pair: index -1 paired with the reducer's identity value.
    KeyValuePair<int, T> kv = {-1, init};
    // Each thread scans a blockDim.x-strided subset of the inner axis.
    for (int k = threadIdx.x; k < inner_size; k += blockDim.x) {
      kv = reducer({k, X[i * inner_size * d + k * d + j]}, kv);
    }
    // Combine the per-thread winners; only thread 0 holds the valid result.
    kv = BlockReduce<int, T>(temp_storage).Reduce(kv, reducer);
    if (threadIdx.x == 0) {
      Y[idx] = static_cast<int64_t>(kv.key);
    }
    // temp_storage is reused on the next outer iteration, so every thread
    // must be done with this reduction before it can be overwritten.
    __syncthreads();
  }
}

} // namespace
|
|
|
|
// CUDA specialization of ArgMaxReducer: for every (prev, next) slot, finds
// the position along the reduced axis (length n) that holds the largest
// value of X and stores it into Y. Runs asynchronously on the context's
// stream; always reports success.
template <>
template <typename T>
bool ArgMaxReducer<CUDAContext>::operator()(
    const int prev_size,
    const int next_size,
    const int n,
    const T* X,
    int64_t* Y,
    CUDAContext* context) const {
  // One output element per (prev, next) pair.
  const int num_slots = prev_size * next_size;
  const FixedDivisor<int> next_div(next_size);
  // One block per slot, capped at the launcher's block budget; the kernel
  // grid-strides over any remainder.
  const int num_blocks = std::min(num_slots, CAFFE_MAXIMUM_NUM_BLOCKS);
  // lowest() is the identity for max, so idle thread lanes cannot win.
  ComputeArgCUDAKernel<<<
      num_blocks,
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      num_slots,
      n,
      next_div,
      cub::ArgMax(),
      std::numeric_limits<T>::lowest(),
      X,
      Y);
  return true;
}
|
|
|
|
// CUDA specialization of ArgMinReducer: for every (prev, next) slot, finds
// the position along the reduced axis (length n) that holds the smallest
// value of X and stores it into Y. Runs asynchronously on the context's
// stream; always reports success.
template <>
template <typename T>
bool ArgMinReducer<CUDAContext>::operator()(
    const int prev_size,
    const int next_size,
    const int n,
    const T* X,
    int64_t* Y,
    CUDAContext* context) const {
  // One output element per (prev, next) pair.
  const int num_slots = prev_size * next_size;
  const FixedDivisor<int> next_div(next_size);
  // One block per slot, capped at the launcher's block budget; the kernel
  // grid-strides over any remainder.
  const int num_blocks = std::min(num_slots, CAFFE_MAXIMUM_NUM_BLOCKS);
  // max() is the identity for min, so idle thread lanes cannot win.
  ComputeArgCUDAKernel<<<
      num_blocks,
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      num_slots,
      n,
      next_div,
      cub::ArgMin(),
      std::numeric_limits<T>::max(),
      X,
      Y);
  return true;
}
|
|
|
|
// Register the GPU implementations as the ArgMax / ArgMin operators.
REGISTER_CUDA_OPERATOR(ArgMax, ArgOp<CUDAContext, ArgMaxReducer<CUDAContext>>);
REGISTER_CUDA_OPERATOR(ArgMin, ArgOp<CUDAContext, ArgMinReducer<CUDAContext>>);
|
|
|
|
} // namespace caffe2
|