mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[CUDA][avgpool2d] Fix backward launch bounds again for sm100, sm120 (#150640)
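`__CUDA_ARCH__` is not visible in host code, so a host-side `#if` on it cannot select the thread count for the current device. This causes incorrect launch bounds and `too many resources requested for launch` errors on Blackwell. The fix reads the compute capability from the current device properties at runtime and uses 768 threads when the major version is >= 10, and 1024 otherwise.

CC @atalman @malfet, as we would want this in 2.7. @nWEIdia

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150640
Approved by: https://github.com/malfet, https://github.com/drisspg, https://github.com/atalman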
This commit is contained in:
parent
73358d37da
commit
09c4da9325
|
|
@ -402,11 +402,12 @@ TORCH_IMPL_FUNC(avg_pool2d_backward_out_cuda) (
|
|||
bool use_divisor = divisor_override.has_value();
|
||||
const auto divisor_override_value = use_divisor ? divisor_override.value() : 0;
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
|
||||
constexpr int double_threads = 768;
|
||||
#else
|
||||
constexpr int double_threads = 1024;
|
||||
#endif
|
||||
cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
|
||||
const bool gesm10x = properties->major >= 10;
|
||||
int double_threads = 1024;
|
||||
if (gesm10x) {
|
||||
double_threads = 768;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(),
|
||||
"avg_pool2d_backward_out_cuda_frame",
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user