Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49145
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49105

(1) Add a safety check `C10_CUDA_KERNEL_LAUNCH_CHECK()` after each kernel launch. This diff only changes the files inside the directories /fbsource/fbcode/caffe2/modules/, /fbsource/fbcode/caffe2/fb/, and /fbsource/fbcode/caffe2/test/.
(2) Get rid of the old check `AT_CUDA_CHECK(cudaGetLastError())` where necessary.

Test Plan:
Test build:
```
buck build mode/dev-nosan //caffe2/modules/detectron:
buck test mode/dev-nosan //caffe2/modules/detectron:
buck build mode/dev-nosan //caffe2/torch/fb/:
buck test mode/dev-nosan //caffe2/torch/fb/:
```
To check for launches without checks:
```
python3 caffe2/torch/testing/check_kernel_launches.py
```
Make sure none of the updated files are in the returned list.

Reviewed By: r-barnes

Differential Revision: D25452852

fbshipit-source-id: d6657edab612c9e0fa99b29c68460be8b1a20064
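In sketch form, the pattern this diff applies after every launch looks like the following (`my_kernel` and its arguments are hypothetical placeholders; only the two check macros come from the actual change):
```
// Before: poll the CUDA error state by hand after the launch.
my_kernel<<<blocks, threads, 0, stream>>>(in, out, n);
AT_CUDA_CHECK(cudaGetLastError());

// After: the dedicated macro wraps the same cudaGetLastError() poll
// at the launch site.
my_kernel<<<blocks, threads, 0, stream>>>(in, out, n);
C10_CUDA_KERNEL_LAUNCH_CHECK();
```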
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu
 *
 * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
 * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
 * Copyright (c) 2011-2013 NYU (Clement Farabet)
 * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert,
 *                         Leon Bottou, Iain Melvin, Jason Weston)
 * Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
 * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
 *                         Samy Bengio, Johnny Mariethoz)
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the names of NEC Laboratories American and IDIAP Research
 *    Institute nor the names of its contributors may be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "caffe2/core/context_gpu.h"
#include "modules/detectron/upsample_nearest_op.h"

namespace caffe2 {

namespace {
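// Maps a flat index into the upscaled (..., d1, d2, d3) output tensor to the
// flat index of its nearest-neighbor source element in the input: the two
// innermost spatial coordinates are integer-divided by scale_factor.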
__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) {
  int x, y, z, w;
  w = ii % d3;
  ii = ii/d3;
  z = ii % d2;
  ii = ii/d2;
  y = ii % d1;
  ii = ii/d1;
  x = ii;
  w = w/scale_factor;
  z = z/scale_factor;
  d2 /= scale_factor;
  d3 /= scale_factor;
  return (((x*d1+y)*d2)+z)*d3+w;
}
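// Inverse mapping for the backward pass: given a flat index into gradInput
// and an (off_x, off_y) offset inside the scale_factor x scale_factor window,
// returns the flat index of the corresponding gradOutput element.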
__device__ int translate_idx_inv(
    int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) {
  int x, y, z, w;
  w = ii % d3;
  ii = ii/d3;
  z = ii % d2;
  ii = ii/d2;
  y = ii % d1;
  ii = ii/d1;
  x = ii;
  w = w*scale_factor+off_x;
  z = z*scale_factor+off_y;
  d2 *= scale_factor;
  d3 *= scale_factor;
  return (((x*d1+y)*d2)+z)*d3+w;
}
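// One thread per output element. The launch uses a 2D grid, so the x- and
// y-block coordinates are flattened into a single linear index ii before the
// bounds check.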
__global__ void upscale(const float *input, float *output, long no_elements,
                        int scale_factor, int d1, int d2, int d3) {
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
  output[ii]=input[ipidx];
}
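// One thread per gradInput element: each thread accumulates the gradients of
// the scale_factor x scale_factor output window it produced in the forward
// pass. The caller zero-fills gradInput before launching this kernel.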
__global__ void downscale(float *gradInput_data, const float *gradOutput_data,
                          long no_elements, int scale_factor, int d1, int d2,
                          int d3) {
  long ii = threadIdx.x + blockDim.x * blockIdx.x;
  ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
  if (ii >= no_elements) return;
  for (int i=0; i < scale_factor; i++){
    for(int j=0; j < scale_factor; j++){
      int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
      gradInput_data[ii] += gradOutput_data[ipidx];
    }
  }
}
} // namespace
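// Forward pass: nearest-neighbor upsampling of the last two (spatial)
// dimensions of X by a factor of scale_.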
template<>
bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);

  vector<int64_t> out_shape;
  for (int i = 0; i < X.ndim(); ++i) {
    out_shape.push_back(X.dim32(i));
  }
  out_shape[X.ndim() - 1] *= scale_;
  out_shape[X.ndim() - 2] *= scale_;
  Y->Resize(out_shape);

  int d1;
  int d2;
  int d3;
  if (X.ndim() == 3) {
    d1 = Y->dim32(0);
    d2 = Y->dim32(1);
    d3 = Y->dim32(2);
  } else {
    d1 = Y->dim32(1);
    d2 = Y->dim32(2);
    d3 = Y->dim32(3);
  }
  long no_elements = Y->size();

  const float *input_data = X.data<float>();
  float *output_data = Y->mutable_data<float>();

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil(
      (float)no_elements / (float)(n_xblocks * nthreads));
  CAFFE_ENFORCE(n_yblocks <= 65535);
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);

  upscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
      input_data, output_data, no_elements, scale_, d1, d2, d3);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
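// Backward pass: each gradInput element sums the gradOutput values over the
// scale_ x scale_ window it was copied to in the forward pass.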
template<>
bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);   // Original input to "forward" op
  auto& dY = Input(1);  // Gradient of net w.r.t. output of "forward" op
                        // (aka "gradOutput")
  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
                        // (aka "gradInput")

  dX->ResizeLike(X);
  float *gradInput_data = dX->mutable_data<float>();
  const float *gradOutput_data = dY.data<float>();

  int d1;
  int d2;
  int d3;
  if (dX->ndim() == 3) {
    d1 = dX->dim32(0);
    d2 = dX->dim32(1);
    d3 = dX->dim32(2);
  } else {
    d1 = dX->dim32(1);
    d2 = dX->dim32(2);
    d3 = dX->dim32(3);
  }
  long no_elements = dX->size();

  // cuda blocks & threads:
  long nthreads = 256;
  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
  // TODO: When we move to SM 3.5 we should update this
  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
  long n_yblocks = (long)ceil(
      (float)no_elements / (float)(n_xblocks * nthreads));
  CAFFE_ENFORCE(n_yblocks <= 65535);
  dim3 blocks(n_xblocks, n_yblocks);
  dim3 threads(nthreads);

  math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_);
  downscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
      gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}
REGISTER_CUDA_OPERATOR(UpsampleNearest,
                       UpsampleNearestOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(UpsampleNearestGradient,
                       UpsampleNearestGradientOp<float, CUDAContext>);
} // namespace caffe2