Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16753

Separate elementwise level2 math functions.

i-am-not-moving-c2-to-c10

Reviewed By: houseroad
Differential Revision: D13954928
fbshipit-source-id: 1ca7a5d3da96e32510f502e5e4e79168854bee67
#include "caffe2/utils/math/broadcast.h"
|
|
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/utils/math/utils.h"
|
|
|
|
namespace caffe2 {
|
|
namespace math {
|
|
|
|
namespace {
|
|
|
|
template <typename T>
__global__ void AffineChannelNCHWCUDAKernel(
    const int C,
    const int M,
    const int HxW,
    const T* X,
    const T* scale,
    const T* bias,
    T* Y);

template <>
__global__ void AffineChannelNCHWCUDAKernel<float>(
    const int C,
    const int M,
    const int HxW,
    const float* X,
    const float* scale,
    const float* bias,
    float* Y) {
  // The launch uses N * C * M blocks, with M = DivUp(HxW, threads): each
  // block covers one CAFFE_CUDA_NUM_THREADS-wide slice of one (n, c) plane.
  const int nc = blockIdx.x / M;
  const int c = nc % C;
  const int w = blockIdx.x % M * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (w < HxW) {
    const int index = nc * HxW + w;
#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
    // __ldg routes the loads through the read-only cache on sm_35+ and HIP.
    Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
#else
    Y[index] = fmaf(X[index], scale[c], bias[c]);
#endif
  }
}
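
// Worked example of the index math above (illustrative numbers, not from the
// original source): with C = 3, HxW = 2048 and CAFFE_CUDA_NUM_THREADS = 128,
// M = DivUp(2048, 128) = 16, so the launch uses N * 3 * 16 blocks. Block 35
// maps to nc = 35 / 16 = 2, channel c = 2 % 3 = 2, and covers
// w in [3 * 128, 4 * 128).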

template <typename T>
__global__ void AffineChannelNHWCCUDAKernel(
    const int C,
    const T* X,
    const T* scale,
    const T* bias,
    T* Y);

template <>
__global__ void AffineChannelNHWCCUDAKernel<float>(
    const int C,
    const float* X,
    const float* scale,
    const float* bias,
    float* Y) {
  // The launch uses a dim3(N * HxW, DivUp(C, threads)) grid: blockIdx.x
  // selects a pixel, and blockIdx.y with threadIdx.x selects the channel.
  const int c = blockIdx.y * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (c < C) {
    const int index = blockIdx.x * C + c;
#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
    Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
#else
    Y[index] = fmaf(X[index], scale[c], bias[c]);
#endif
  }
}
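
// Worked example (illustrative): with C = 256 and CAFFE_CUDA_NUM_THREADS =
// 128, the grid needs DivUp(256, 128) = 2 blocks along y; the thread at
// blockIdx.y = 1, threadIdx.x = 5 handles channel c = 1 * 128 + 5 = 133 of
// the pixel selected by blockIdx.x.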

} // namespace
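
// The macro below stamps out the public AffineChannel<T, CUDAContext, order>
// entry points for both storage orders. For every channel c, both kernels
// compute Y = X * scale[c] + bias[c] in a single fused multiply-add.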

#define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T)                            \
  template <>                                                                \
  CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NCHW>( \
      const int N,                                                           \
      const int C,                                                           \
      const int HxW,                                                         \
      const T* X,                                                            \
      const T* scale,                                                        \
      const T* bias,                                                         \
      T* Y,                                                                  \
      CUDAContext* context) {                                                \
    const int M = DivUp(HxW, CAFFE_CUDA_NUM_THREADS);                        \
    AffineChannelNCHWCUDAKernel<T>                                           \
        <<<N * C * M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(  \
            C, M, HxW, X, scale, bias, Y);                                   \
  }                                                                          \
  template <>                                                                \
  CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NHWC>( \
      const int N,                                                           \
      const int C,                                                           \
      const int HxW,                                                         \
      const T* X,                                                            \
      const T* scale,                                                        \
      const T* bias,                                                         \
      T* Y,                                                                  \
      CUDAContext* context) {                                                \
    const int M = DivUp(C, CAFFE_CUDA_NUM_THREADS);                          \
    AffineChannelNHWCCUDAKernel<T>                                           \
        <<<dim3(N * HxW, M),                                                 \
           CAFFE_CUDA_NUM_THREADS,                                           \
           0,                                                                \
           context->cuda_stream()>>>(C, X, scale, bias, Y);                  \
  }
CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(float)
#undef CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL
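
// Minimal usage sketch (added for illustration; not part of the original
// file). It exercises the float NCHW specialization generated above. The
// shapes, the raw cudaMalloc setup, and the default-constructed CUDAContext
// are assumptions made for this example.
//
//   const int N = 2, C = 3, HxW = 16 * 16;
//   float *X, *scale, *bias, *Y;
//   cudaMalloc(&X, N * C * HxW * sizeof(float));
//   cudaMalloc(&scale, C * sizeof(float));
//   cudaMalloc(&bias, C * sizeof(float));
//   cudaMalloc(&Y, N * C * HxW * sizeof(float));
//   CUDAContext context;
//   AffineChannel<float, CUDAContext, StorageOrder::NCHW>(
//       N, C, HxW, X, scale, bias, Y, &context);
//   context.FinishDeviceComputation();  // wait for the kernel to finish
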
} // namespace math
} // namespace caffe2