Revert D25325039: Check CUDA kernel launches (/fbcode/caffe2/)
Test Plan: revert-hammer
Differential Revision: D25325039 (f5e9ffbc27)
Original commit changeset: 2043d6e63c7d
fbshipit-source-id: 5377dd2aa7c6f58c8641c956b7642c7c559bbc40
This commit is contained in:
parent 7a4a2df225
commit bfa95f90a0
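For context: D25325039 added a C10_CUDA_KERNEL_LAUNCH_CHECK() call after each raw <<<...>>> kernel launch in these files, and this revert deletes those calls again. The following is a minimal sketch of that pattern, not code from this diff: the kernel (scale_kernel), its arguments, and the launch sizes are hypothetical; it only assumes the real macro, available via c10/cuda/CUDAException.h.

#include <c10/cuda/CUDAException.h>

// Hypothetical kernel: scales n floats by alpha.
__global__ void scale_kernel(const float* in, float* out, float alpha, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = alpha * in[i];
  }
}

void scale_cuda(const float* in, float* out, float alpha, int n) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  scale_kernel<<<blocks, threads>>>(in, out, alpha, n);
  // The kind of line D25325039 added after every launch and this revert removes:
  // it checks cudaGetLastError() so a bad launch configuration or missing kernel
  // fails here instead of at the next synchronizing CUDA call.
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

Every hunk below follows this shape: the kernel launch and its arguments remain as context, and only the check line is removed.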
@@ -112,7 +112,6 @@ bool GroupSpatialSoftmaxOp<float, CUDAContext>::RunOnDevice() {
   GroupSpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS,
       0, context_.cuda_stream()>>>(
       N, A, W, H, Xdata, Pdata, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return true;
 }
 
@@ -159,13 +158,11 @@ bool GroupSpatialSoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
   SumProbsKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0,
       context_.cuda_stream()>>>(
       N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Step 2: dX[i] = dX[i] - s
   SubSumKernel<<<CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0,
       context_.cuda_stream()>>>(
       N, A, W, H, sum_probs_.data<float>(), dXdata, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Step 3: dX[i] = Y[i] * dX[i]
   math::Mul<float, CUDAContext>(Y.size(), dXdata, Ydata, dXdata, &context_);
@@ -253,7 +253,6 @@ bool PSRoIPoolOp<float, CUDAContext>::RunOnDevice() {
       output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
       X.dim32(3), pooled_height_, pooled_width_, R.data<float>(), output_dim_,
       group_size_, Y->mutable_data<float>(), A->mutable_data<int>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return true;
 }
 
@@ -277,7 +276,6 @@ bool PSRoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
       dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
       X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
       output_dim_, dX->mutable_data<float>(), R.data<float>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return true;
 }
 
@@ -149,7 +149,6 @@ bool RoIPoolFOp<float, CUDAContext>::RunOnDevice() {
       output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
       X.dim32(3), pooled_height_, pooled_width_, R.data<float>(),
       Y->mutable_data<float>(), A->mutable_data<int>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return true;
 }
 
@@ -174,7 +173,6 @@ bool RoIPoolFGradientOp<float, CUDAContext>::RunOnDevice() {
         dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
         X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
         dX->mutable_data<float>(), R.data<float>());
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
   }
   return true;
 }
@@ -129,7 +129,6 @@ bool SelectSmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
       M, Y_hat.data<float>(), Y.data<float>(),
       L.data<float>(), buff_.mutable_data<float>(),
       S.data<float>(), beta_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Sum of all losses
   // al := sum_i l_i
@@ -176,7 +175,6 @@ bool SelectSmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
       D, H, W, M, Y_hat.data<float>(), Y.data<float>(),
       L.data<float>(), d_Y_hat->mutable_data<float>(),
       d_avg_loss.data<float>(), scale_, S.data<float>(), beta_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   return true;
 }
@@ -93,8 +93,6 @@ bool SigmoidCrossEntropyLossOp<float, CUDAContext>::RunOnDevice() {
       T.data<int>(),
       losses_.mutable_data<float>(),
       counts_.mutable_data<float>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
   float* avg_loss_data = avg_loss->mutable_data<float>();
   math::Sum<float, CUDAContext>(
       losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
@@ -108,7 +106,6 @@ bool SigmoidCrossEntropyLossOp<float, CUDAContext>::RunOnDevice() {
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
     math::Div<float, CUDAContext>(
         1, avg_loss_data, normalizer_data, avg_loss_data, &context_);
   }
@@ -138,7 +135,6 @@ bool SigmoidCrossEntropyLossGradientOp<float, CUDAContext>::RunOnDevice() {
       T.data<int>(),
       dX->mutable_data<float>(),
       counts_.mutable_data<float>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   if (normalize_) {
     float* normalizer_data = normalizer_.mutable_data<float>();
     math::Sum<float, CUDAContext>(
@@ -149,7 +145,6 @@ bool SigmoidCrossEntropyLossGradientOp<float, CUDAContext>::RunOnDevice() {
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5);
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
     math::Div<float, CUDAContext>(
         1,
         d_avg_loss.data<float>(),
@@ -134,7 +134,6 @@ bool SigmoidFocalLossOp<float, CUDAContext>::RunOnDevice() {
       N, D, H, W, X.data<float>(), T.data<int>(),
       wp.data<float>(), gamma_, alpha_, num_classes_,
       losses_.mutable_data<float>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   math::Sum<float, CUDAContext>(
       losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
@@ -166,7 +165,6 @@ bool SigmoidFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
       N, D, H, W, X.data<float>(), T.data<int>(), dX->mutable_data<float>(),
       wp.data<float>(), gamma_, alpha_, num_classes_,
       d_avg_loss.data<float>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   math::Scale<float, float, CUDAContext>(
       dX->size(),
       scale_,
@@ -102,7 +102,6 @@ bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
       context_.cuda_stream()>>>(
       buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
       beta_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Element-wise weighted smooth l1 loss (can be used to specify a per-element
   // loss weight)
@@ -165,8 +164,6 @@ bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
       context_.cuda_stream()>>>(
       buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
       d_avg_loss.data<float>(), scale_ / N, beta_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
   // Element-wise scale by alpha_in and alpha_out
   math::Mul<float, CUDAContext>(
       d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
@@ -176,7 +176,6 @@ bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
       <<<CAFFE_GET_BLOCKS(N * A * H * W), CAFFE_CUDA_NUM_THREADS,
          0, context_.cuda_stream()>>>(
       N, A, H, W, Xdata, P->mutable_data<float>(), num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Compute loss for each x,y location
   const int* Tdata = T.data<int>();
@@ -185,7 +184,6 @@ bool SoftmaxFocalLossOp<float, CUDAContext>::RunOnDevice() {
          0, context_.cuda_stream()>>>(
       N, A, H, W, P->data<float>(), Tdata, losses_.mutable_data<float>(),
       Wdata, gamma_, alpha_, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // sum the losses
   float* avg_loss_data = avg_loss->mutable_data<float>();
@@ -229,8 +227,6 @@ bool SoftmaxFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
          0, context_.cuda_stream()>>>(
       N, A, H, W, Pdata, Tdata, buff_.mutable_data<float>(),
       Wdata, gamma_, alpha_, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
   // Compute the gradient with the weights
   const float* Bdata = buff_.data<float>();
   SoftmaxFocalLossGradientKernel
@@ -238,7 +234,6 @@ bool SoftmaxFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
          0, context_.cuda_stream()>>>(
       N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data<float>(),
       dX->mutable_data<float>(), num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   math::Scale<float, float, CUDAContext>(
       dX->size(),
       scale_,
@@ -115,7 +115,6 @@ bool SpatialNarrowAsOp<CUDAContext>::DoRunWithType() {
       out_width,
       A.template data<T>(),
       C->template mutable_data<T>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   return true;
 }
@@ -153,7 +152,6 @@ bool SpatialNarrowAsGradientOp<CUDAContext>::DoRunWithType() {
       out_width,
       dC.template data<T>(),
       dA->template mutable_data<T>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   return true;
 }
@@ -164,8 +164,6 @@ bool UpsampleNearestOp<float, CUDAContext>::RunOnDevice() {
 
   upscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
       input_data, output_data, no_elements, scale_, d1, d2, d3);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
   return true;
 }
 
@@ -211,7 +209,6 @@ bool UpsampleNearestGradientOp<float, CUDAContext>::RunOnDevice() {
   math::Set<float, CUDAContext>(no_elements, 0.f, gradInput_data, &context_);
   downscale<<<blocks, threads, 0, context_.cuda_stream()>>>(
       gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   return true;
 }
@@ -26,5 +26,4 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) {
   const int threads = 1024;
   const int blocks = (size + threads - 1) / threads;
   sigmoid_add_kernel<<<blocks, threads>>>(x, y, output, size);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
@@ -20,5 +20,4 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) {
   const int threads = 1024;
   const int blocks = (size + threads - 1) / threads;
   sigmoid_add_kernel<<<blocks, threads>>>(x, y, output, size);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
@@ -20,5 +20,4 @@ void tanh_add_cuda(const float* x, const float* y, float* output, int size) {
   const int threads = 1024;
   const int blocks = (size + threads - 1) / threads;
   tanh_add_kernel<<<blocks, threads>>>(x, y, output, size);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
@@ -17,7 +17,6 @@ __global__ void waitClocks(const uint64_t count) {
 
 void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks) {
   waitClocks<<<1, 1, 0, stream.stream()>>>(clocks);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 
 int cudaNumDevices() {