[CUDA][avgpool2d] Fix backward launch bounds again for sm100, sm120 (#150640)

`__CUDA_ARCH__` is not defined in host code, so the previous `#if` always fell through to the default launch bounds, causing a `too many resources requested for launch` error on Blackwell.
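
A minimal sketch of the pattern (hypothetical helper `pick_block_size`, not the PyTorch source): because `__CUDA_ARCH__` is only defined while nvcc compiles device code, a host-side `#if` on it silently takes the `#else` branch, so the block size has to come from a runtime device query instead.

```cuda
// Sketch only: host-side selection of launch bounds via a runtime device query.
// pick_block_size is a hypothetical helper, not part of the actual change.
#include <cuda_runtime.h>

static int pick_block_size(int device) {
  // __CUDA_ARCH__ is undefined here (host compilation), so any
  // "#if __CUDA_ARCH__ >= 1000" would always pick the #else value.
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  // sm100 / sm120 (Blackwell) report compute capability major >= 10;
  // use fewer threads per block there to stay within per-block resource
  // limits and avoid "too many resources requested for launch".
  return prop.major >= 10 ? 768 : 1024;
}
```

The actual fix below does the same thing through ATen's `at::cuda::getCurrentDeviceProperties()`.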

CC @atalman @malfet as we would want this in 2.7 @nWEIdia

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150640
Approved by: https://github.com/malfet, https://github.com/drisspg, https://github.com/atalman
Eddie Yan 2025-04-04 13:05:40 +00:00 committed by PyTorch MergeBot
parent 73358d37da
commit 09c4da9325

@@ -402,11 +402,12 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cuda) (
   bool use_divisor = divisor_override.has_value();
   const auto divisor_override_value = use_divisor ? divisor_override.value() : 0;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
-  constexpr int double_threads = 768;
-#else
-  constexpr int double_threads = 1024;
-#endif
+  cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
+  const bool gesm10x = properties->major >= 10;
+  int double_threads = 1024;
+  if (gesm10x) {
+    double_threads = 768;
+  }
   AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
     "avg_pool2d_backward_out_cuda_frame",