// Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-07 12:21:27 +01:00).
//
// Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16679
// Codemod generated with clangr shard mode, 25 files per diff.
// Reviewed By: houseroad
// Differential Revision: D13929450
// fbshipit-source-id: fcc222744c28b41f2cedffc0c2ef5d04aceaa5af
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/operators/integral_image_op.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
namespace {
|
|
// Row pass of the integral image: for every output row, computes the
// cumulative sum of the input along the columns.
//
// Input is one image plane of shape (H, W); output is (H + 1, W + 1),
// shifted by (+1, +1) so that the output's first row and first column are
// all zeros. One logical thread handles one output row across all images
// and channels, i.e. count == N * C * (H + 1).
//
// Fixes vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `col`), and double literals
// (`0.`) were used in a float kernel (now `0.f`).
__global__ void RowPassKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing.
    const int row = i % rows_out;
    const int chan = (i / rows_out) % chans;
    const int ind = i / rows_out / chans;
    // Input is (H, W) and output is (H + 1, W + 1).
    const int rows_in = rows_out - 1;
    const int cols_in = cols_out - 1;
    // Row pointer into the input data. The input is shifted (-1, -1)
    // relative to the output, hence row - 1. (For row == 0 this pointer is
    // out of range but is never dereferenced.)
    const float* row_in_data =
        in + cols_in * ((row - 1) + rows_in * (chan + ind * chans));
    // Row pointer into the output data.
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // The first row and first column of the output are all zeros.
    row_out_data[0] = 0.f;
    if (row == 0) {
      for (int col = 1; col < cols_out; ++col) {
        row_out_data[col] = 0.f;
      }
    } else {
      for (int col = 1; col < cols_out; ++col) {
        // Recall that input data is shifted (-1, -1) relative to the
        // output, hence col - 1.
        row_out_data[col] = row_out_data[col - 1] + row_in_data[col - 1];
      }
    }
  }
}
|
|
|
|
// Row pass of the backward computation: for every row, computes the
// cumulative sum of the incoming gradient along the columns.
//
// Input is one gradient plane of shape (H + 1, W + 1); output is
// (H + 1, W). One logical thread handles one output row across all images
// and channels, i.e. count == N * C * (H + 1).
//
// Fixes vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `col`), and two comments
// incorrectly described the row pointers as "Col pointer".
__global__ void RowPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing.
    const int row = i % rows_out;
    const int chan = (i / rows_out) % chans;
    const int ind = i / rows_out / chans;
    // Input is (H + 1, W + 1) and output is (H + 1, W).
    const int rows_in = rows_out;
    const int cols_in = cols_out + 1;
    // Row pointer into the input data.
    const float* row_in_data =
        in + cols_in * (row + rows_in * (chan + ind * chans));
    // Row pointer into the output data.
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // Running prefix sum along the row.
    row_out_data[0] = row_in_data[0];
    for (int col = 1; col < cols_out; ++col) {
      row_out_data[col] = row_out_data[col - 1] + row_in_data[col];
    }
  }
}
|
|
|
|
// Column pass of the integral image: in-place cumulative sum down each
// column of the row-pass result, yielding the final integral image.
//
// Operates in place on the (N, C, H + 1, W + 1) output tensor produced by
// RowPassKernel. One logical thread handles one column across all images
// and channels, i.e. count == N * C * (W + 1).
//
// Fix vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `row`).
__global__ void
ColPassKernel(int count, int rows_out, int cols_out, int chans, float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing.
    const int col = i % cols_out;
    const int chan = (i / cols_out) % chans;
    const int ind = i / cols_out / chans;
    // Pointer to the top element of this thread's column.
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Accumulate down the column; row 0 already holds its final value.
    for (int row = 1; row < rows_out; ++row) {
      col_out_data[row * cols_out] += col_out_data[(row - 1) * cols_out];
    }
  }
}
|
|
|
|
// Column pass of the backward computation: for every column, computes the
// cumulative sum of the row-pass gradient down the rows.
//
// Input is one buffer plane of shape (H + 1, W); output (the final
// gradient w.r.t. the forward input) is (H, W). One logical thread
// handles one column across all images and channels, i.e.
// count == N * C * W.
//
// Fix vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `row`).
__global__ void ColPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing.
    const int col = i % cols_out;
    const int chan = (i / cols_out) % chans;
    const int ind = i / cols_out / chans;
    // Input is (H + 1, W) and output is (H, W).
    const int rows_in = rows_out + 1;
    const int cols_in = cols_out;
    // Column pointer into the input data.
    const float* col_in_data =
        in + col + cols_in * rows_in * (chan + ind * chans);
    // Column pointer into the output data.
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Running prefix sum down the column.
    col_out_data[0] = col_in_data[0];
    for (int row = 1; row < rows_out; ++row) {
      col_out_data[row * cols_out] =
          col_out_data[(row - 1) * cols_out] + col_in_data[row * cols_in];
    }
  }
}
|
|
|
|
} // namespace
|
|
|
|
// Forward pass for IntegralImage on CUDA.
//
// Input X is (N, C, H, W); output Y is (N, C, H + 1, W + 1). The output's
// first row and first column are zero (written by RowPassKernel), so the
// integral image is shifted (+1, +1) relative to X. Two kernel launches:
// a row-wise prefix sum of X into Y, then an in-place column-wise prefix
// sum over Y. Both launches run on this op's CUDA stream.
//
// Fix vs. previous revision: typo in the enforce message
// ("momement" -> "moment").
template <>
bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);

  CAFFE_ENFORCE(X.dim() == 4, "Only supports 4D tensors for the moment");

  // Input is (N, C, H, W)
  // Output is (N, C, H + 1, W + 1)
  vector<int64_t> out_shape(X.sizes().vec());
  out_shape[2] += 1; // H + 1 output size
  out_shape[3] += 1; // W + 1 output size
  auto* Y = Output(0, out_shape, at::dtype<float>());

  const int chans = X.dim32(1);
  const int rows_out = Y->dim32(2);
  const int cols_out = Y->dim32(3);

  // Integral image over rows of input X: one thread per output row
  // across all images and channels.
  const int row_pass_size = X.dim32(0) * chans * rows_out;
  RowPassKernel<<<
      CAFFE_GET_BLOCKS(row_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      row_pass_size,
      rows_out,
      cols_out,
      chans,
      X.data<float>(),
      Y->template mutable_data<float>());

  // Integral image over columns of the integral image over rows, in
  // place on Y: one thread per output column.
  const int col_pass_size = X.dim32(0) * chans * cols_out;
  ColPassKernel<<<
      CAFFE_GET_BLOCKS(col_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      col_pass_size,
      rows_out,
      cols_out,
      chans,
      Y->template mutable_data<float>());
  return true;
}
|
|
|
|
// Backward pass for IntegralImage on CUDA.
//
// Given X (the forward input, shape (N, C, H, W)) and dY (the gradient
// w.r.t. the forward output, shape (N, C, H + 1, W + 1)), produces dX
// with the shape of X. Mirrors the forward computation with two prefix
// sum passes over the gradient: a row pass that reduces the width to W
// (stored in row_pass_buffer_), then a column pass that reduces the
// height to H and writes dX. Both launches run on this op's CUDA stream.
template <>
bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Original input to the forward op
  auto& dY = Input(1); // Gradient w.r.t. the forward op's output
                       // (aka "gradOutput")

  // Gradient w.r.t. the forward op's input (aka "gradInput"); same shape
  // as X.
  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  // Intermediate buffer for the row pass: dY's shape with the last
  // dimension reduced by one, i.e. (N, C, H + 1, W).
  auto buffer_shape = dY.sizes().vec();
  buffer_shape[3] -= 1;
  ReinitializeTensor(
      &row_pass_buffer_, buffer_shape, at::dtype<float>().device(CUDA));

  const int num_channels = row_pass_buffer_.dim32(1);
  const int buffer_rows = row_pass_buffer_.dim32(2);
  const int buffer_cols = row_pass_buffer_.dim32(3);

  // Row pass over dY: one thread per buffer row across all images and
  // channels.
  const int num_row_threads = X.dim32(0) * num_channels * buffer_rows;
  RowPassGradientKernel<<<
      CAFFE_GET_BLOCKS(num_row_threads),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      num_row_threads,
      buffer_rows,
      buffer_cols,
      num_channels,
      dY.data<float>(),
      row_pass_buffer_.mutable_data<float>());

  // Column pass over the buffer into dX: one thread per column. dX has
  // one row fewer than the buffer, hence buffer_rows - 1.
  const int num_col_threads = X.dim32(0) * num_channels * buffer_cols;
  ColPassGradientKernel<<<
      CAFFE_GET_BLOCKS(num_col_threads),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      num_col_threads,
      buffer_rows - 1,
      buffer_cols,
      num_channels,
      row_pass_buffer_.data<float>(),
      dX->template mutable_data<float>());
  return true;
}
|
|
|
|
// Register the CUDA implementations of the forward and gradient operators
// under the same names used by the CPU versions.
REGISTER_CUDA_OPERATOR(IntegralImage, IntegralImageOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    IntegralImageGradient,
    IntegralImageGradientOp<float, CUDAContext>);
|
|
|
|
} // namespace caffe2
|