Add warning for incorrect grad results at world size 1 (#154928)
Add warning for the issue discussed at https://github.com/pytorch/pytorch/issues/144045

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154928
Approved by: https://github.com/weifengpy
parent eb4cf59ecd
commit 82fb904140
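For context, the issue tracked at https://github.com/pytorch/pytorch/issues/144045 is that NCCL's ReduceOp.AVG can produce incorrect gradient results when the world size is 1. A minimal sketch of an averaging reduce-scatter that sidesteps AVG by summing and dividing explicitly; the helper name safe_average_reduce_scatter is hypothetical and not part of the FSDP API:

import torch
import torch.distributed as dist

def safe_average_reduce_scatter(
    output: torch.Tensor, input: torch.Tensor, group: dist.ProcessGroup
) -> None:
    # Hypothetical helper: average via SUM plus an explicit division instead
    # of ReduceOp.AVG, avoiding the world-size-1 issue noted in the diff below.
    world_size = group.size()
    if world_size == 1:
        # Single rank: there is nothing to reduce, so just copy the input.
        output.copy_(input.view_as(output))
        return
    dist.reduce_scatter_tensor(output, input, op=dist.ReduceOp.SUM, group=group)
    output.div_(world_size)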
@@ -569,6 +569,7 @@ def _get_gradient_divide_factors(
 ) -> Union[tuple[None, None], tuple[float, float]]:
     # For fp32/bf16, we do not need to worry about overflow/underflow, so we
     # use NCCL's built-in division to avoid separate div kernels
+    # Warning: NCCL ReduceOp.AVG may produce incorrect results with world size 1.
     if reduce_dtype in (torch.float32, torch.bfloat16) and device_type != "mtia":
         return None, None
     data_parallel_size = reduce_scatter_group.size()
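The factors this function goes on to compute exist because, for fp16, dividing by the full world size in one step risks overflow or underflow during the reduction; FSDP instead splits the division into a pre-divide applied before the reduce-scatter and a post-divide applied after. A sketch of that splitting heuristic, modeled on FSDP1's _get_gradient_predivide_factor; this is an illustration under that assumption, not the exact upstream implementation:

def split_divide_factors(data_parallel_size: int) -> tuple[float, float]:
    # Split the world-size division into two roughly balanced factors so
    # fp16 gradients are scaled down before the reduction (pre-divide) and
    # the remainder is applied afterwards (post-divide).
    factor = 1
    while data_parallel_size % factor == 0 and data_parallel_size / factor > factor:
        factor *= 2
    return float(factor), data_parallel_size / factor

# e.g. a world size of 8 yields (4.0, 2.0): divide by 4 before the
# reduce-scatter and by 2 after, instead of a single divide by 8.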