Add warning for incorrect gradient results at world size 1 (#154928)

Add a warning comment noting that NCCL's ReduceOp.AVG may produce incorrect results when the world size is 1, as discussed at https://github.com/pytorch/pytorch/issues/144045

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154928
Approved by: https://github.com/weifengpy
This commit is contained in:
yifanmao 2025-06-17 00:08:04 +00:00 committed by PyTorch MergeBot
parent eb4cf59ecd
commit 82fb904140

View File

@ -569,6 +569,7 @@ def _get_gradient_divide_factors(
) -> Union[tuple[None, None], tuple[float, float]]:
# For fp32/bf16, we do not need to worry about overflow/underflow, so we
# use NCCL's built-in division to avoid separate div kernels
# Warning: NCCL ReduceOp.AVG may produce incorrect results with world size 1.
if reduce_dtype in (torch.float32, torch.bfloat16) and device_type != "mtia":
return None, None
data_parallel_size = reduce_scatter_group.size()