// Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-07 12:21:27 +01:00).
//
// Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16679
// Codemod generated with clangr shard mode, 25 files per diff.
// Reviewed By: houseroad
// Differential Revision: D13929450
// fbshipit-source-id: fcc222744c28b41f2cedffc0c2ef5d04aceaa5af
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/operators/integral_image_op.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
namespace {
|
|
// Row pass of the integral image: for every output row, computes the
// cumulative sum of the input along the columns.
//
// Input is one image plane of shape (H, W); output is (H + 1, W + 1),
// shifted by (+1, +1) so that the output's first row and first column are
// all zeros. One logical thread handles one output row across all images
// and channels, i.e. count == N * C * (H + 1).
//
// Fixes vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `col`), and double literals
// (`0.`) were used in a float kernel (now `0.f`).
__global__ void RowPassKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing.
    const int row = i % rows_out;
    const int chan = (i / rows_out) % chans;
    const int ind = i / rows_out / chans;
    // Input is (H, W) and output is (H + 1, W + 1).
    const int rows_in = rows_out - 1;
    const int cols_in = cols_out - 1;
    // Row pointer into the input data. The input is shifted (-1, -1)
    // relative to the output, hence row - 1. (For row == 0 this pointer is
    // out of range but is never dereferenced.)
    const float* row_in_data =
        in + cols_in * ((row - 1) + rows_in * (chan + ind * chans));
    // Row pointer into the output data.
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // The first row and first column of the output are all zeros.
    row_out_data[0] = 0.f;
    if (row == 0) {
      for (int col = 1; col < cols_out; ++col) {
        row_out_data[col] = 0.f;
      }
    } else {
      for (int col = 1; col < cols_out; ++col) {
        // Recall that input data is shifted (-1, -1) relative to the
        // output, hence col - 1.
        row_out_data[col] = row_out_data[col - 1] + row_in_data[col - 1];
      }
    }
  }
}
|
|
|
|
// Row pass of the backward computation: for every row, computes the
// cumulative sum of the incoming gradient along the columns.
//
// Input is one gradient plane of shape (H + 1, W + 1); output is
// (H + 1, W). One logical thread handles one output row across all images
// and channels, i.e. count == N * C * (H + 1).
//
// Fixes vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `col`), and two comments
// incorrectly described the row pointers as "Col pointer".
__global__ void RowPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which row, channel, and batch element we're processing.
    const int row = i % rows_out;
    const int chan = (i / rows_out) % chans;
    const int ind = i / rows_out / chans;
    // Input is (H + 1, W + 1) and output is (H + 1, W).
    const int rows_in = rows_out;
    const int cols_in = cols_out + 1;
    // Row pointer into the input data.
    const float* row_in_data =
        in + cols_in * (row + rows_in * (chan + ind * chans));
    // Row pointer into the output data.
    float* row_out_data =
        out + cols_out * (row + rows_out * (chan + ind * chans));
    // Running prefix sum along the row.
    row_out_data[0] = row_in_data[0];
    for (int col = 1; col < cols_out; ++col) {
      row_out_data[col] = row_out_data[col - 1] + row_in_data[col];
    }
  }
}
|
|
|
|
// Column pass of the integral image: in-place cumulative sum down each
// column of the row-pass result, yielding the final integral image.
//
// Operates in place on the (N, C, H + 1, W + 1) output tensor produced by
// RowPassKernel. One logical thread handles one column across all images
// and channels, i.e. count == N * C * (W + 1).
//
// Fix vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `row`).
__global__ void
ColPassKernel(int count, int rows_out, int cols_out, int chans, float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing.
    const int col = i % cols_out;
    const int chan = (i / cols_out) % chans;
    const int ind = i / cols_out / chans;
    // Pointer to the top element of this thread's column.
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Accumulate down the column; row 0 already holds its final value.
    for (int row = 1; row < rows_out; ++row) {
      col_out_data[row * cols_out] += col_out_data[(row - 1) * cols_out];
    }
  }
}
|
|
|
|
// Column pass of the backward computation: for every column, computes the
// cumulative sum of the row-pass gradient down the rows.
//
// Input is one buffer plane of shape (H + 1, W); output (the final
// gradient w.r.t. the forward input) is (H, W). One logical thread
// handles one column across all images and channels, i.e.
// count == N * C * W.
//
// Fix vs. previous revision: the inner loop index shadowed the
// CUDA_1D_KERNEL_LOOP index `i` (renamed to `row`).
__global__ void ColPassGradientKernel(
    int count,
    int rows_out,
    int cols_out,
    int chans,
    const float* in,
    float* out) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // Figure out which col, channel, and batch element we're processing.
    const int col = i % cols_out;
    const int chan = (i / cols_out) % chans;
    const int ind = i / cols_out / chans;
    // Input is (H + 1, W) and output is (H, W).
    const int rows_in = rows_out + 1;
    const int cols_in = cols_out;
    // Column pointer into the input data.
    const float* col_in_data =
        in + col + cols_in * rows_in * (chan + ind * chans);
    // Column pointer into the output data.
    float* col_out_data =
        out + col + cols_out * rows_out * (chan + ind * chans);
    // Running prefix sum down the column.
    col_out_data[0] = col_in_data[0];
    for (int row = 1; row < rows_out; ++row) {
      col_out_data[row * cols_out] =
          col_out_data[(row - 1) * cols_out] + col_in_data[row * cols_in];
    }
  }
}
|
|
|
|
} // namespace
|
|
|
|
// Forward pass for IntegralImage on CUDA.
//
// Input X is (N, C, H, W); output Y is (N, C, H + 1, W + 1). The output's
// first row and first column are zero (written by RowPassKernel), so the
// integral image is shifted (+1, +1) relative to X. Two kernel launches:
// a row-wise prefix sum of X into Y, then an in-place column-wise prefix
// sum over Y. Both launches run on this op's CUDA stream.
//
// Fix vs. previous revision: typo in the enforce message
// ("momement" -> "moment").
template <>
bool IntegralImageOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);

  CAFFE_ENFORCE(X.dim() == 4, "Only supports 4D tensors for the moment");

  // Input is (N, C, H, W)
  // Output is (N, C, H + 1, W + 1)
  vector<int64_t> out_shape(X.sizes().vec());
  out_shape[2] += 1; // H + 1 output size
  out_shape[3] += 1; // W + 1 output size
  auto* Y = Output(0, out_shape, at::dtype<float>());

  const int chans = X.dim32(1);
  const int rows_out = Y->dim32(2);
  const int cols_out = Y->dim32(3);

  // Integral image over rows of input X: one thread per output row
  // across all images and channels.
  const int row_pass_size = X.dim32(0) * chans * rows_out;
  RowPassKernel<<<
      CAFFE_GET_BLOCKS(row_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      row_pass_size,
      rows_out,
      cols_out,
      chans,
      X.data<float>(),
      Y->template mutable_data<float>());

  // Integral image over columns of the integral image over rows, in
  // place on Y: one thread per output column.
  const int col_pass_size = X.dim32(0) * chans * cols_out;
  ColPassKernel<<<
      CAFFE_GET_BLOCKS(col_pass_size),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      col_pass_size,
      rows_out,
      cols_out,
      chans,
      Y->template mutable_data<float>());
  return true;
}
|
|
|
|
// Backward pass for IntegralImage on CUDA.
//
// Given X (the forward input, shape (N, C, H, W)) and dY (the gradient
// w.r.t. the forward output, shape (N, C, H + 1, W + 1)), produces dX
// with the shape of X. Mirrors the forward computation with two prefix
// sum passes over the gradient: a row pass that reduces the width to W
// (stored in row_pass_buffer_), then a column pass that reduces the
// height to H and writes dX. Both launches run on this op's CUDA stream.
template <>
bool IntegralImageGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0); // Original input to the forward op
  auto& dY = Input(1); // Gradient w.r.t. the forward op's output
                       // (aka "gradOutput")

  // Gradient w.r.t. the forward op's input (aka "gradInput"); same shape
  // as X.
  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  // Intermediate buffer for the row pass: dY's shape with the last
  // dimension reduced by one, i.e. (N, C, H + 1, W).
  auto buffer_shape = dY.sizes().vec();
  buffer_shape[3] -= 1;
  ReinitializeTensor(
      &row_pass_buffer_, buffer_shape, at::dtype<float>().device(CUDA));

  const int num_channels = row_pass_buffer_.dim32(1);
  const int buffer_rows = row_pass_buffer_.dim32(2);
  const int buffer_cols = row_pass_buffer_.dim32(3);

  // Row pass over dY: one thread per buffer row across all images and
  // channels.
  const int num_row_threads = X.dim32(0) * num_channels * buffer_rows;
  RowPassGradientKernel<<<
      CAFFE_GET_BLOCKS(num_row_threads),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      num_row_threads,
      buffer_rows,
      buffer_cols,
      num_channels,
      dY.data<float>(),
      row_pass_buffer_.mutable_data<float>());

  // Column pass over the buffer into dX: one thread per column. dX has
  // one row fewer than the buffer, hence buffer_rows - 1.
  const int num_col_threads = X.dim32(0) * num_channels * buffer_cols;
  ColPassGradientKernel<<<
      CAFFE_GET_BLOCKS(num_col_threads),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      num_col_threads,
      buffer_rows - 1,
      buffer_cols,
      num_channels,
      row_pass_buffer_.data<float>(),
      dX->template mutable_data<float>());
  return true;
}
|
|
|
|
// Register the CUDA implementations of the forward and gradient operators
// under the same names used by the CPU versions.
REGISTER_CUDA_OPERATOR(IntegralImage, IntegralImageOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    IntegralImageGradient,
    IntegralImageGradientOp<float, CUDAContext>);
|
|
|
|
} // namespace caffe2
|