mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: This PR updates PyTorch for the following cub changes: - Starting with cub 1.13.1, cub requires users to define `CUB_NS_QUALIFIER` if `CUB_NS_PREFIX` is also defined. In addition, a new mechanism, `CUB_WRAPPED_NAMESPACE`, was added. This PR makes the following changes to PyTorch: - Starting with CUDA 11.5, define `CUB_WRAPPED_NAMESPACE` globally as an nvcc flag. - Fix caffe2 failures caused by the above change. - Add an `aten/src/ATen/cuda/cub_definitions.cuh` header that defines helper macros for feature availability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/66219 Reviewed By: bdhirsh Differential Revision: D31626931 Pulled By: ngimel fbshipit-source-id: 97ebf5ef671ade8bf46d0860edc317f22660f26d
62 lines
3.3 KiB
Plaintext
#ifndef CAFFE2_UTILS_MATH_REDUCE_CUH_
#define CAFFE2_UTILS_MATH_REDUCE_CUH_

// cub_namespace.cuh must be included before any <cub/...> header: it sets up
// the cub namespace-wrapping macros so caffe2's cub symbols do not collide
// with other cub users linked into the same binary.
#include "caffe2/utils/cub_namespace.cuh"
#include <cub/block/block_reduce.cuh>

#include "caffe2/core/common_gpu.h"

namespace caffe2 {

// Block-wide reduction for a 1-D thread block of CAFFE_CUDA_NUM_THREADS
// threads. Thin alias over cub::BlockReduce; callers must provide the
// cub TempStorage in shared memory as usual.
template <typename T>
using BlockReduce = cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS>;

// Block-wide reduction for a 2-D (kBlockDimX x kBlockDimY) thread block,
// using cub's warp-reductions algorithm.
template <typename T, int kBlockDimX, int kBlockDimY>
using BlockReduce2D = cub::
    BlockReduce<T, kBlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, kBlockDimY>;

// Launches Func<T, kBlockDimX, kBlockDimY> with a 2-D block shape chosen
// from `size` (the length of the reduced dimension). Every shape uses 128
// threads total; a larger `size` gets more threads along Y (the reduction
// axis) and fewer along X. Each launch is immediately followed by
// C10_CUDA_KERNEL_LAUNCH_CHECK() so launch-configuration errors are not
// silently dropped.
#define DISPATCH_REDUCE_KERNEL_BY_2D_BLOCK_WITH_TYPE_1(                       \
    size, Func, T, grid_dim, cuda_stream, ...)                                \
  do {                                                                        \
    if (size >= 128) {                                                        \
      Func<T, 1, 128>                                                         \
          <<<grid_dim, dim3(1, 128), 0, cuda_stream>>>(__VA_ARGS__);          \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    } else if (size >= 64) {                                                  \
      Func<T, 2, 64><<<grid_dim, dim3(2, 64), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    } else if (size >= 32) {                                                  \
      Func<T, 4, 32><<<grid_dim, dim3(4, 32), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    } else {                                                                  \
      Func<T, 8, 16><<<grid_dim, dim3(8, 16), 0, cuda_stream>>>(__VA_ARGS__); \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    }                                                                         \
  } while (false)

// Same size-based 2-D block dispatch as above, for kernels templated on two
// types: Func<T1, T2, kBlockDimX, kBlockDimY>.
#define DISPATCH_REDUCE_KERNEL_BY_2D_BLOCK_WITH_TYPE_2(                       \
    size, Func, T1, T2, grid_dim, cuda_stream, ...)                           \
  do {                                                                        \
    if (size >= 128) {                                                        \
      Func<T1, T2, 1, 128>                                                    \
          <<<grid_dim, dim3(1, 128), 0, cuda_stream>>>(__VA_ARGS__);          \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    } else if (size >= 64) {                                                  \
      Func<T1, T2, 2, 64>                                                     \
          <<<grid_dim, dim3(2, 64), 0, cuda_stream>>>(__VA_ARGS__);           \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    } else if (size >= 32) {                                                  \
      Func<T1, T2, 4, 32>                                                     \
          <<<grid_dim, dim3(4, 32), 0, cuda_stream>>>(__VA_ARGS__);           \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    } else {                                                                  \
      Func<T1, T2, 8, 16>                                                     \
          <<<grid_dim, dim3(8, 16), 0, cuda_stream>>>(__VA_ARGS__);           \
      C10_CUDA_KERNEL_LAUNCH_CHECK();                                         \
    }                                                                         \
  } while (false)

} // namespace caffe2

#endif // CAFFE2_UTILS_MATH_REDUCE_CUH_
|