[CUDA][avgpool2d] Fix backward launch bounds again for sm100, sm120 (#150640)

`__CUDA_ARCH__` is not defined in host code, so the previous `#if` always fell through to the default launch bounds, causing a `too many resources requested for launch` error on Blackwell.
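
A minimal sketch of the pattern (hypothetical helper `pick_block_size`, not the PyTorch source): because `__CUDA_ARCH__` is only defined while nvcc compiles device code, a host-side `#if` on it silently takes the `#else` branch, so the block size has to come from a runtime device query instead.

```cuda
// Sketch only: host-side selection of launch bounds via a runtime device query.
// pick_block_size is a hypothetical helper, not part of the actual change.
#include <cuda_runtime.h>

static int pick_block_size(int device) {
  // __CUDA_ARCH__ is undefined here (host compilation), so any
  // "#if __CUDA_ARCH__ >= 1000" would always pick the #else value.
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  // sm100 / sm120 (Blackwell) report compute capability major >= 10;
  // use fewer threads per block there to stay within per-block resource
  // limits and avoid "too many resources requested for launch".
  return prop.major >= 10 ? 768 : 1024;
}
```

The actual fix below does the same thing through ATen's `at::cuda::getCurrentDeviceProperties()`.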

CC @atalman @malfet as we would want this in 2.7 @nWEIdia

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150640
Approved by: https://github.com/malfet, https://github.com/drisspg, https://github.com/atalman
Eddie Yan 2025-04-04 13:05:40 +00:00 committed by PyTorch MergeBot
parent 73358d37da
commit 09c4da9325

@@ -402,11 +402,12 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cuda) (
   bool use_divisor = divisor_override.has_value();
   const auto divisor_override_value = use_divisor ? divisor_override.value() : 0;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
-  constexpr int double_threads = 768;
-#else
-  constexpr int double_threads = 1024;
-#endif
+  cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
+  const bool gesm10x = properties->major >= 10;
+  int double_threads = 1024;
+  if (gesm10x) {
+    double_threads = 768;
+  }
   AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
     "avg_pool2d_backward_out_cuda_frame",