// TODO: reduce the apparent redundancy of all the code below.
#include <cfloat>

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pool_op.h"

namespace caffe2 {
namespace {
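// LpPoolFunctor carries no state: it is a tag type whose only job is to
// select the LpPool specializations of PoolOp / PoolGradientOp below.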
struct LpPoolFunctor {
  explicit LpPoolFunctor(const OperatorBase& /* op */) {}
};
} // namespace

namespace {

using c10::cuda::compat::abs;
using c10::cuda::compat::pow;
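
// Forward Lp pooling: each output element is the Lp norm of its input
// window, Y = (sum over window of |X|^p)^(1/p). One thread computes one
// output element; this variant walks the input in NCHW order.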
template <typename T>
__global__ void LpPoolForwardNCHW(
    const int nthreads,
    const T* bottom_data,
    const int num,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    T* top_data,
    const T p) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int n = index;
    int pw = n % pooled_width;
    n /= pooled_width;
    int ph = n % pooled_height;
    n /= pooled_height;
    int c = n % channels;
    n /= channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    top_data[index] = 0;
    int bottom_offset = (n * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        top_data[index] +=
            pow(abs(bottom_data[bottom_offset + h * width + w]), p);
      }
    }
    top_data[index] = pow(top_data[index], static_cast<T>(1.0) / p);
  }
}

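// NHWC variant of the forward pass. Channels are the fastest-moving
// dimension, so consecutive window elements are `channels` apart; the sum
// is accumulated in a register and written to top_data once.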
template <typename T>
__global__ void LpPoolForwardNHWC(
    const int nthreads,
    const T* bottom_data,
    const int num,
    const int height,
    const int width,
    const int channels,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    T* top_data,
    const T p) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int c = index % channels;
    int pw = (index / channels) % pooled_width;
    int ph = (index / channels / pooled_width) % pooled_height;
    int n = index / channels / pooled_width / pooled_height;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    T output = 0;
    int bottom_offset = n * height * width * channels + c;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        output += pow(
            abs(bottom_data[bottom_offset + (h * width + w) * channels]), p);
      }
    }
    top_data[index] = pow(output, static_cast<T>(1.0) / p);
  }
}

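// Backward Lp pooling. For Y = (sum over window of |x|^p)^(1/p), the chain
// rule gives dY/dx = x * |x|^(p-2) / Y^(p-1), so each input element sums
// dY * x * |x|^(p-2) / Y^(p-1) over every pooling window containing it.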
template <typename T>
__global__ void LpPoolBackwardNCHW(
    const int nthreads,
    const T* const top_diff,
    const T* const top_data,
    const T* const bottom_data,
    const int num,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    T* const bottom_diff,
    const T p) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Recover the (n, c, h, w) position of this input element, with h and w
    // expressed in padded coordinates.
    const int w = index % width + pad_l;
    const int h = (index / width) % height + pad_t;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;
    // Range of pooling windows that contain this element.
    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
    const int phend = min(h / stride_h + 1, pooled_height);
    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
    const int pwend = min(w / stride_w + 1, pooled_width);
    T gradient = 0;
    const T* const top_diff_slice =
        top_diff + (n * channels + c) * pooled_height * pooled_width;
    const T* const top_data_slice =
        top_data + (n * channels + c) * pooled_height * pooled_width;

    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
        gradient += top_diff_slice[ph * pooled_width + pw] *
            bottom_data[index] * pow(abs(bottom_data[index]), p - 2) /
            pow(top_data_slice[ph * pooled_width + pw], p - 1);
      }
    }
    bottom_diff[index] = gradient;
  }
}

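// NHWC variant of the backward pass; same math as above with window
// elements strided by `channels`.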
template <typename T>
__global__ void LpPoolBackwardNHWC(
    const int nthreads,
    const T* const top_diff,
    const T* const top_data,
    const T* const bottom_data,
    const int num,
    const int height,
    const int width,
    const int channels,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    T* const bottom_diff,
    const T p) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // Recover the (n, h, w, c) position of this input element, with h and w
    // expressed in padded coordinates.
    const int c = index % channels;
    const int w = (index / channels) % width + pad_l;
    const int h = (index / channels / width) % height + pad_t;
    const int n = index / channels / width / height;
    // Range of pooling windows that contain this element.
    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
    const int phend = min(h / stride_h + 1, pooled_height);
    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
    const int pwend = min(w / stride_w + 1, pooled_width);
    T gradient = 0;
    const T* const top_diff_slice =
        top_diff + n * pooled_height * pooled_width * channels + c;
    const T* const top_data_slice =
        top_data + n * pooled_height * pooled_width * channels + c;
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
            bottom_data[index] * pow(abs(bottom_data[index]), p - 2) /
            pow(top_data_slice[(ph * pooled_width + pw) * channels], p - 1);
      }
    }
    bottom_diff[index] = gradient;
  }
}

} // namespace
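
// Host-side launchers. The exponent is read from the operator's "p"
// argument and defaults to 2.0, i.e. plain L2 pooling.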
template <>
bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto* Y = Output(0);
  ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1));
  int output_size = Y->numel();
  LpPoolForwardNCHW<float>
      <<<CAFFE_GET_BLOCKS(output_size),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          output_size,
          X.data<float>(),
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          Y->dim32(2),
          Y->dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l(),
          Y->template mutable_data<float>(),
          OperatorBase::GetSingleArgument<float>("p", 2.0));
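  // Surface kernel launch failures eagerly rather than at the next sync
  // (assumes C10_CUDA_KERNEL_LAUNCH_CHECK is visible via the c10 CUDA
  // headers pulled in above).
  C10_CUDA_KERNEL_LAUNCH_CHECK();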
  return true;
}

template <>
bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto* Y = Output(0);
  ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3));
  int output_size = Y->numel();
  LpPoolForwardNHWC<float>
      <<<CAFFE_GET_BLOCKS(output_size),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          output_size,
          X.data<float>(),
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          Y->dim32(1),
          Y->dim32(2),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l(),
          Y->template mutable_data<float>(),
          OperatorBase::GetSingleArgument<float>("p", 2.0));
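  // As above: surface launch failures eagerly.
  C10_CUDA_KERNEL_LAUNCH_CHECK();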
  return true;
}

template <>
bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
    RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  CAFFE_ENFORCE_EQ(dY.dim(), 4);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
  LpPoolBackwardNCHW<float>
      <<<CAFFE_GET_BLOCKS(X.numel()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          X.numel(),
          dY.data<float>(),
          Y.data<float>(),
          X.data<float>(),
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          dY.dim32(2),
          dY.dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l(),
          dX->template mutable_data<float>(),
          OperatorBase::GetSingleArgument<float>("p", 2.0));
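  // As above: surface launch failures eagerly.
  C10_CUDA_KERNEL_LAUNCH_CHECK();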
  return true;
}

template <>
bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
    RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  CAFFE_ENFORCE_EQ(dY.dim(), 4);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(1), X.dim32(2)});
  LpPoolBackwardNHWC<float>
      <<<CAFFE_GET_BLOCKS(X.numel()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          X.numel(),
          dY.data<float>(),
          Y.data<float>(),
          X.data<float>(),
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          dY.dim32(1),
          dY.dim32(2),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l(),
          dX->template mutable_data<float>(),
          OperatorBase::GetSingleArgument<float>("p", 2.0));
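  // As above: surface launch failures eagerly.
  C10_CUDA_KERNEL_LAUNCH_CHECK();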
  return true;
}

REGISTER_CUDA_OPERATOR(LpPool, PoolOp<float, CUDAContext, LpPoolFunctor>);
REGISTER_CUDA_OPERATOR(
    LpPoolGradient,
    PoolGradientOp<float, CUDAContext, LpPoolFunctor>);
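
// ConvPoolOpBase routes to the NCHW or NHWC method based on the op's
// "order" argument. A minimal creation sketch (hedged: the helpers are
// assumed from caffe2/utils/proto_utils.h and the values are illustrative):
//
//   OperatorDef def = CreateOperatorDef(
//       "LpPool", "", {"X"}, {"Y"},
//       {MakeArgument<int>("kernel", 2),
//        MakeArgument<float>("p", 3.0f),
//        MakeArgument<string>("order", "NCHW")});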

} // namespace caffe2