pytorch/caffe2/operators/space_batch_op_gpu.cu
Richard Barnes 9409a3a39b Check kernel launches in caffe2/operators (#52240)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/52240

Test Plan: Sandcastle tests

Reviewed By: xush6528

Differential Revision: D26408330

fbshipit-source-id: 60779ba0e38c8f90e0e341c8faa2661e631112dd
2021-02-16 16:42:05 -08:00

#include "caffe2/operators/space_batch_op.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
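
// SpaceToBatch kernel: each thread handles one element of the NCHW output.
// The flat index i is decomposed into (out_b, d, out_h, out_w); the output
// batch index encodes both the original input batch (in_b) and the spatial
// block offset (offset_h, offset_w), which together locate the source pixel
// in the (possibly padded) input. Reads that fall into the padding write zero.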
__global__ void SpaceToBatch(
    int N,
    int output_batch,
    int output_depth,
    int output_height,
    int output_width,
    int input_batch,
    int input_depth,
    int input_height,
    int input_width,
    const int pad_l,
    const int pad_t,
    int block_size,
    const float* input,
    float* output) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    // Recall:
    // const auto output_offset =
    //     ((out_b * output_depth + d) * output_height + out_h) * output_width +
    //     out_w;
    const int out_w = i % output_width;
    const int i_2 = i / output_width;
    const int out_h = i_2 % output_height;
    const int i_3 = i_2 / output_height;
    const int d = i_3 % output_depth;
    const int out_b = i_3 / output_depth;
    const int in_b = out_b % input_batch;
    const int offset_w = (out_b / input_batch) % block_size;
    const int offset_h = (out_b / input_batch) / block_size;
    const int in_h = out_h * block_size + offset_h - pad_t;
    const int in_w = out_w * block_size + offset_w - pad_l;
    if (in_h >= 0 && in_w >= 0 && in_h < input_height && in_w < input_width) {
      const auto input_offset =
          ((in_b * input_depth + d) * input_height + in_h) * input_width +
          in_w;
      output[i] = input[input_offset];
    } else {
      output[i] = 0.0;
    }
  }
}
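
// Host-side CUDA specialization: launches one thread per element of the
// output tensor and verifies the launch with C10_CUDA_KERNEL_LAUNCH_CHECK().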
template <>
void spaceToBatch<CUDAContext>(
    const Tensor& input,
    int pad_t,
    int pad_l,
    int block_size,
    Tensor* output,
    CUDAContext* context) {
  const int output_batch = output->dim32(0);
  const int output_depth = output->dim32(1);
  const int output_height = output->dim32(2);
  const int output_width = output->dim32(3);
  const int input_batch = input.dim32(0);
  const int input_depth = input.dim32(1);
  const int input_height = input.dim32(2);
  const int input_width = input.dim32(3);
  const int N = output->numel();
  SpaceToBatch<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      N,
      output_batch,
      output_depth,
      output_height,
      output_width,
      input_batch,
      input_depth,
      input_height,
      input_width,
      pad_l,
      pad_t,
      block_size,
      input.data<float>(),
      output->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
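
// BatchToSpace kernel: the inverse mapping. Each thread handles one element
// of the NCHW input; the input batch index is decomposed into the output
// batch (out_b) and the spatial block offset, and the value is scattered to
// the corresponding output location unless it falls inside the cropped
// padding region.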
__global__ void BatchToSpace(
    int N,
    int output_batch,
    int output_depth,
    int output_height,
    int output_width,
    int input_batch,
    int input_depth,
    int input_height,
    int input_width,
    const int pad_l,
    const int pad_t,
    int block_size,
    const float* input,
    float* output) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    // Recall:
    // const auto input_offset = ((in_b * input_depth + d) *
    //     input_height + in_h) * input_width + in_w;
    const int in_w = i % input_width;
    const int i_2 = i / input_width;
    const int in_h = i_2 % input_height;
    const int i_3 = i_2 / input_height;
    const int d = i_3 % input_depth;
    const int in_b = i_3 / input_depth;
    const int out_b = in_b % output_batch;
    const int offset_w = (in_b / output_batch) % block_size;
    const int offset_h = (in_b / output_batch) / block_size;
    const int out_h = in_h * block_size + offset_h - pad_t;
    const int out_w = in_w * block_size + offset_w - pad_l;
    if (out_h >= 0 && out_w >= 0 && out_h < output_height &&
        out_w < output_width) {
      const auto output_offset =
          ((out_b * output_depth + d) * output_height + out_h) *
              output_width +
          out_w;
      output[output_offset] = input[i];
    }
  }
}
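
// Host-side CUDA specialization: checks that both tensors are 4-D, launches
// one thread per input element, and verifies the kernel launch.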
template <>
void batchToSpace(
    const Tensor& input,
    int pad_t,
    int pad_l,
    int block_size,
    Tensor* output,
    CUDAContext* context) {
  CAFFE_ENFORCE(input.dim() == 4);
  CAFFE_ENFORCE(output->dim() == 4);
  const int output_batch = output->dim32(0);
  const int output_depth = output->dim32(1);
  const int output_height = output->dim32(2);
  const int output_width = output->dim32(3);
  const int input_batch = input.dim32(0);
  const int input_depth = input.dim32(1);
  const int input_height = input.dim32(2);
  const int input_width = input.dim32(3);
  const int N = input.numel();
  BatchToSpace<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      N,
      output_batch,
      output_depth,
      output_height,
      output_width,
      input_batch,
      input_depth,
      input_height,
      input_width,
      pad_l,
      pad_t,
      block_size,
      input.data<float>(),
      output->template mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

REGISTER_CUDA_OPERATOR(SpaceToBatch, SpaceToBatchOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(BatchToSpace, BatchToSpaceOp<CUDAContext>);

} // namespace caffe2