Add warning for incorrect grad results at world size 1 (#154928)
Add warning for the issue discussed at https://github.com/pytorch/pytorch/issues/144045

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154928
Approved by: https://github.com/weifengpy
parent eb4cf59ecd
commit 82fb904140
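For context, the issue tracked at https://github.com/pytorch/pytorch/issues/144045 is that NCCL's ReduceOp.AVG can produce incorrect gradient results when the world size is 1. A minimal sketch of an averaging reduce-scatter that sidesteps AVG by summing and dividing explicitly; the helper name safe_average_reduce_scatter is hypothetical and not part of the FSDP API:

import torch
import torch.distributed as dist

def safe_average_reduce_scatter(
    output: torch.Tensor, input: torch.Tensor, group: dist.ProcessGroup
) -> None:
    # Hypothetical helper: average via SUM plus an explicit division instead
    # of ReduceOp.AVG, avoiding the world-size-1 issue noted in the diff below.
    world_size = group.size()
    if world_size == 1:
        # Single rank: there is nothing to reduce, so just copy the input.
        output.copy_(input.view_as(output))
        return
    dist.reduce_scatter_tensor(output, input, op=dist.ReduceOp.SUM, group=group)
    output.div_(world_size)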
@@ -569,6 +569,7 @@ def _get_gradient_divide_factors(
 ) -> Union[tuple[None, None], tuple[float, float]]:
     # For fp32/bf16, we do not need to worry about overflow/underflow, so we
     # use NCCL's built-in division to avoid separate div kernels
+    # Warning: NCCL ReduceOp.AVG may produce incorrect results with world size 1.
     if reduce_dtype in (torch.float32, torch.bfloat16) and device_type != "mtia":
         return None, None
     data_parallel_size = reduce_scatter_group.size()
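The factors this function goes on to compute exist because, for fp16, dividing by the full world size in one step risks overflow or underflow during the reduction; FSDP instead splits the division into a pre-divide applied before the reduce-scatter and a post-divide applied after. A sketch of that splitting heuristic, modeled on FSDP1's _get_gradient_predivide_factor; this is an illustration under that assumption, not the exact upstream implementation:

def split_divide_factors(data_parallel_size: int) -> tuple[float, float]:
    # Split the world-size division into two roughly balanced factors so
    # fp16 gradients are scaled down before the reduction (pre-divide) and
    # the remainder is applied afterwards (post-divide).
    factor = 1
    while data_parallel_size % factor == 0 and data_parallel_size / factor > factor:
        factor *= 2
    return float(factor), data_parallel_size / factor

# e.g. a world size of 8 yields (4.0, 2.0): divide by 4 before the
# reduce-scatter and by 2 after, instead of a single divide by 8.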