Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-07 12:21:27 +01:00
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33563

When NVCC or Clang is driving CUDA compilation, many math functions are declared by default, with a small difference: Clang marks them as `__device__` only, while NVCC marks them as both `__host__` and `__device__`. As a result, every unqualified `min` or `max` call from a `__host__` function becomes a compile error when Clang is used. Fix the errors by using `std::min` and `std::max` from `<algorithm>`; since C++14 they are `constexpr`, so they can also be used in `__device__` code [1].

1. https://llvm.org/docs/CompileCudaWithLLVM.html#algorithm

Test Plan:
```lang=bash
buck build mode/opt -c fbcode.cuda_use_clang=true //fblearner/flow/projects/dper:workflow
buck build mode/opt //fblearner/flow/projects/dper:workflow
```

Execute tests on devgpu:
```
buck test mode/dev-nosan -j 8 //caffe2/caffe2/python/operator_test/... //caffe2/test:cuda
```

Reviewed By: ngimel

Differential Revision: D20005795

fbshipit-source-id: 98a3f35e8a96c15d3ad3d2066396591f5cca1696
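A minimal sketch of the failure mode described above, using a made-up `ScaleKernel`/`LaunchScale` pair (nothing from this file); it only illustrates where the unqualified `min` call sits (host code) and why `std::min` compiles under both compilers:

```lang=cpp
#include <algorithm>

// Hypothetical kernel; the names in this sketch are illustrative only.
__global__ void ScaleKernel(const int n, float* data) {
  // Grid-stride loop so a capped block count still covers all n elements.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    data[i] *= 2.0f;
  }
}

// Host-side launcher. With Clang driving CUDA compilation, the implicitly
// declared ::min is __device__ only, so an unqualified `min(n, max_blocks)`
// here would not compile; NVCC declares it __host__ __device__, which is why
// the same call built before. std::min from <algorithm> resolves for both,
// and, being constexpr since C++14, Clang also accepts it in device code [1].
void LaunchScale(float* data, const int n, const int max_blocks) {
  const int blocks = std::min(n, max_blocks);
  ScaleKernel<<<blocks, 128>>>(n, data);
}
```

In the file below the same pattern appears in the kernel launch configurations, e.g. `std::min(n, CAFFE_MAXIMUM_NUM_BLOCKS)`.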
182 lines · 4.9 KiB · Plaintext
#include <algorithm>

#include <cub/block/block_reduce.cuh>

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/normalize_l1_op.h"
#include "caffe2/operators/normalize_op.h"

namespace caffe2 {

// Blocks walk the n slices in a grid-stride loop; the threads of a block
// cooperatively compute the L2 norm of one length-m slice (elements strided
// by sf) and divide the slice by that norm, clamped below at kEps.
__global__ void NormalizeKernel(
    const int m,
    const int n,
    const int sf,
    const float* xData,
    float* yData,
    const float kEps) {
  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ BlockReduce::TempStorage temp_storage;

  for (int i = blockIdx.x; i < n; i += gridDim.x) {
    auto base = (i / sf) * sf * m + (i % sf);

    float sum = 0.0;
    __shared__ float norm;
    for (int j = threadIdx.x; j < m; j += blockDim.x) {
      const auto x_ij = xData[base + j * sf];
      sum += x_ij * x_ij;
    }
    float reduce_result = BlockReduce(temp_storage).Sum(sum);

    if (threadIdx.x == 0) {
      norm = sqrtf(reduce_result);
      norm = fmaxf(norm, kEps);
    }
    __syncthreads();
    for (int j = threadIdx.x; j < m; j += blockDim.x) {
      const auto index = base + j * sf;
      yData[index] = xData[index] / norm;
    }
  }
}

// Backward pass: for each length-N slice computes
//   dX = dY / ||x|| - x * (x . dY) / ||x||^3,
// with ||x|| clamped below at kEps.
__global__ void NormalizeGradientKernel(
    const int M,
    const int N,
    const int SF,
    const float* in_mat,
    const float* grad_out_mat,
    float* grad_mat,
    const float kEps) {
  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ BlockReduce::TempStorage temp_storage_sum;
  __shared__ BlockReduce::TempStorage temp_storage_norm;
  for (int i = blockIdx.x; i < M; i += gridDim.x) {
    float sum = 0.0;
    float norm = 0.0;
    __shared__ float row_sum;
    __shared__ float row_norm;
    __shared__ float row_norm_3;
    auto base = (i / SF) * SF * N + (i % SF);
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
      int index = base + j * SF;
      sum += in_mat[index] * grad_out_mat[index];
      norm += in_mat[index] * in_mat[index];
    }
    float reduce_result = BlockReduce(temp_storage_sum).Sum(sum);
    float reduce_norm = BlockReduce(temp_storage_norm).Sum(norm);

    if (threadIdx.x == 0) {
      row_sum = reduce_result;
      row_norm = sqrtf(reduce_norm);
      row_norm = fmaxf(row_norm, kEps);
      row_norm_3 = powf(row_norm, 3);
    }
    __syncthreads();
    for (int j = threadIdx.x; j < N; j += blockDim.x) {
      int index = base + j * SF;
      const float x_ij = in_mat[index];
      const float dy_ij = grad_out_mat[index];
      grad_mat[index] = (dy_ij / row_norm) - ((x_ij / row_norm_3) * row_sum);
    }
  }
}

template <>
void NormalizeOp<float, CUDAContext>::DoNormalize(
    const float* xData,
    float* yData,
    const int m,
    const int n,
    const int sf) {
  NormalizeKernel<<<
      // std::min (not the unqualified min) so this host-side call resolves
      // under both NVCC and Clang; see the summary above.
      std::min(n, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(m, n, sf, xData, yData, kEps_);
}

template <>
bool NormalizeGradientOp<float, CUDAContext>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& dY = Input(1);
  auto* dX = Output(0);
  dX->ResizeLike(X);

  const auto canonical_axis =
      X.canonical_axis_index(OperatorBase::GetSingleArgument<int>("axis", -1));
  int N = X.dim32(canonical_axis);
  int M = X.numel() / N;
  const int SF = X.size_from_dim(canonical_axis + 1);
  NormalizeGradientKernel<<<
      std::min(M, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      M,
      N,
      SF,
      X.data<float>(),
      dY.data<float>(),
      dX->template mutable_data<float>(),
      kEps_);
  return true;
}

namespace {
// L1 variant: divides each slice by the sum of absolute values; the output is
// not written for slices whose L1 norm is zero.
__global__ void NormalizeL1Kernel(
    const int m,
    const int n,
    const int sf,
    const float* xData,
    float* yData) {
  typedef cub::BlockReduce<float, CAFFE_CUDA_NUM_THREADS> BlockReduce;
  __shared__ BlockReduce::TempStorage temp_storage;

  for (int i = blockIdx.x; i < n; i += gridDim.x) {
    auto base = (i / sf) * sf * m + (i % sf);

    float sum = 0.0;
    __shared__ float norm;
    for (int j = threadIdx.x; j < m; j += blockDim.x) {
      const auto x_ij = xData[base + j * sf];
      sum += fabsf(x_ij);
    }
    float reduce_result = BlockReduce(temp_storage).Sum(sum);

    if (threadIdx.x == 0) {
      norm = reduce_result;
    }
    __syncthreads();
    if (norm != 0) {
      for (int j = threadIdx.x; j < m; j += blockDim.x) {
        const auto index = base + j * sf;
        yData[index] = xData[index] / norm;
      }
    }
  }
}
} // namespace

template <>
void NormalizeL1Op<float, CUDAContext>::DoNormalize(
    const float* xData,
    float* yData,
    const int m,
    const int n,
    const int sf) {
  NormalizeL1Kernel<<<
      std::min(n, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(m, n, sf, xData, yData);
}

REGISTER_CUDA_OPERATOR(Normalize, NormalizeOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    NormalizeGradient,
    NormalizeGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(NormalizeL1, NormalizeL1Op<float, CUDAContext>);

} // namespace caffe2