pytorch/caffe2/utils/math/reduce.cuh
Xiang Gao b8dfb45ac2 Refactor cub namespace handling (#66219)
Summary:
This PR updates PyTorch for the following cub changes:
- Starting with cub 1.13.1, cub requires users to define `CUB_NS_QUALIFIER` if `CUB_NS_PREFIX` is also defined. Besides that, a new mechanism, `CUB_WRAPPED_NAMESPACE`, is added.
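
For orientation, here is a rough sketch of that mechanism, paraphrased (not copied verbatim) from cub's util_namespace.cuh:

// Rough sketch of cub's namespace machinery; illustrative, not verbatim.
// When CUB_WRAPPED_NAMESPACE is defined, cub derives all three macros itself
// and nests every cub symbol inside that namespace:
#ifdef CUB_WRAPPED_NAMESPACE
#define CUB_NS_PREFIX namespace CUB_WRAPPED_NAMESPACE {
#define CUB_NS_POSTFIX } // namespace CUB_WRAPPED_NAMESPACE
#define CUB_NS_QUALIFIER ::CUB_WRAPPED_NAMESPACE::cub
#endif
// A build that defines CUB_NS_PREFIX/CUB_NS_POSTFIX by hand must now also
// supply a matching CUB_NS_QUALIFIER (the fully qualified name of the wrapped
// cub namespace), otherwise recent cub headers refuse to compile.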

And I make the following changes to PyTorch:
- Starting with CUDA 11.5, define `CUB_WRAPPED_NAMESPACE` globally as an nvcc flag.
- Fix caffe2 failures caused by the above change (see the sketch after this list).
- Add an `aten/src/ATen/cuda/cub_definitions.cuh` that defines helper macros for feature availability.
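
A hedged sketch of the caffe2 side of the fix, assuming the wrapped-namespace build (the real `caffe2/utils/cub_namespace.cuh` may differ in detail): headers such as the one below include the shim before any cub header, and the shim aliases the wrapped cub back into `namespace caffe2` so existing unqualified `cub::` spellings keep compiling.

// Illustrative sketch only -- not the verbatim caffe2/utils/cub_namespace.cuh.
// When cub is wrapped (e.g. nvcc is passed -DCUB_WRAPPED_NAMESPACE=at_cuda_detail),
// re-expose it inside namespace caffe2 under its old name via a namespace alias.
#if defined(CUB_WRAPPED_NAMESPACE)
namespace caffe2 {
namespace cub = ::CUB_WRAPPED_NAMESPACE::cub;
} // namespace caffe2
#endif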

Pull Request resolved: https://github.com/pytorch/pytorch/pull/66219

Reviewed By: bdhirsh

Differential Revision: D31626931

Pulled By: ngimel

fbshipit-source-id: 97ebf5ef671ade8bf46d0860edc317f22660f26d
2021-10-25 14:37:09 -07:00

#ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_
#define CAFFE2_UTILS_MATH_REDUCE_CUH_

#include "caffe2/utils/cub_namespace.cuh"
#include <cub/block/block_reduce.cuh>

#include "caffe2/core/common_gpu.h"

namespace caffe2 {

template <typename T>
using BlockReduce = cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS>;

template <typename T, int kBlockDimX, int kBlockDimY>
using BlockReduce2D = cub::
    BlockReduce<T, kBlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, kBlockDimY>;

#define DISPATCH_REDUCE_KERNEL_BY_2D_BLOCK_WITH_TYPE_1( \
    size, Func, T, grid_dim, cuda_stream, ...) \
  do { \
    if (size >= 128) { \
      Func<T, 1, 128> \
          <<<grid_dim, dim3(1, 128), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } else if (size >= 64) { \
      Func<T, 2, 64><<<grid_dim, dim3(2, 64), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } else if (size >= 32) { \
      Func<T, 4, 32><<<grid_dim, dim3(4, 32), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } else { \
      Func<T, 8, 16><<<grid_dim, dim3(8, 16), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } \
  } while (false)

#define DISPATCH_REDUCE_KERNEL_BY_2D_BLOCK_WITH_TYPE_2( \
    size, Func, T1, T2, grid_dim, cuda_stream, ...) \
  do { \
    if (size >= 128) { \
      Func<T1, T2, 1, 128> \
          <<<grid_dim, dim3(1, 128), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } else if (size >= 64) { \
      Func<T1, T2, 2, 64> \
          <<<grid_dim, dim3(2, 64), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } else if (size >= 32) { \
      Func<T1, T2, 4, 32> \
          <<<grid_dim, dim3(4, 32), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } else { \
      Func<T1, T2, 8, 16> \
          <<<grid_dim, dim3(8, 16), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    } \
  } while (false)

} // namespace caffe2

#endif // CAFFE2_UTILS_MATH_REDUCE_CUH_
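
To make the dispatch pattern concrete, here is a hypothetical caller (RowwiseSumCUDAKernel and RowwiseSum are made up for illustration, not part of caffe2): the kernel is written against BlockReduce2D, and the host wrapper lets the macro pick a 1x128, 2x64, 4x32, or 8x16 block shape from the length of the reduced dimension. All four shapes launch 128 threads per block; the branches only trade the x extent for the y extent, and the trailing macro arguments are forwarded to the kernel launch via __VA_ARGS__.

// Hypothetical usage sketch, for illustration only.
#include <c10/cuda/CUDAException.h> // C10_CUDA_KERNEL_LAUNCH_CHECK, used inside the macro
#include "caffe2/utils/math/reduce.cuh"

template <typename T, int kBlockDimX, int kBlockDimY>
__global__ void RowwiseSumCUDAKernel(const int rows, const int cols, const T* X, T* Y) {
  __shared__ typename caffe2::BlockReduce2D<T, kBlockDimX, kBlockDimY>::TempStorage
      temp_storage;
  const int tid = threadIdx.x * kBlockDimY + threadIdx.y;
  for (int i = blockIdx.x; i < rows; i += gridDim.x) {
    T sum = T(0);
    // Each thread accumulates a strided slice of row i ...
    for (int j = tid; j < cols; j += kBlockDimX * kBlockDimY) {
      sum += X[i * cols + j];
    }
    // ... and the whole 2D thread block reduces the partial sums.
    sum = caffe2::BlockReduce2D<T, kBlockDimX, kBlockDimY>(temp_storage).Sum(sum);
    if (tid == 0) {
      Y[i] = sum;
    }
    __syncthreads(); // temp_storage is reused for the next row
  }
}

void RowwiseSum(const int rows, const int cols, const float* X, float* Y, cudaStream_t stream) {
  // cols selects the block shape; (rows, cols, X, Y) are forwarded to the kernel.
  DISPATCH_REDUCE_KERNEL_BY_2D_BLOCK_WITH_TYPE_1(
      cols, RowwiseSumCUDAKernel, float, rows, stream, rows, cols, X, Y);
}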