Revert "Fixing NCCL abort hang issue when a ProcessGroupNCCL manages multiple ncclComms (#150690)"
This reverts commit 91173ff89a.
Reverted https://github.com/pytorch/pytorch/pull/150690 on behalf of https://github.com/atalman due to failing internal test ([comment](https://github.com/pytorch/pytorch/pull/150690#issuecomment-2787905966))
parent 27ded359a5
commit d9f47c75de
```diff
@@ -1371,9 +1371,6 @@ void ProcessGroupNCCL::abortCommsFromMap(
     const std::optional<std::string>& abortReason) {
   // The process may control multiple devices, loop through the communicators on
   // each device
-  // NCCL expects Group abort when there are multiple communicators created in a
-  // device.
-  groupStart();
   for (auto& it : ncclCommsMap) {
     auto& devName = it.first;
     auto& ncclComm = it.second;
@@ -1394,7 +1391,6 @@ void ProcessGroupNCCL::abortCommsFromMap(
     VLOG(2) << logPrefix() << "ProcessGroupNCCL destroyed "
            << " communicator on CUDA device: " << devName;
   }
-  groupEnd();
 }

 // Abort all communicators on this rank
```
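For context, the change being reverted wrapped the per-communicator aborts in an NCCL group, based on the removed comment that NCCL expects a group abort when a process has created multiple communicators on a device. Below is a minimal sketch of that pattern against the raw NCCL API; the `abortAllComms` function, the `comms` map, and the `NCCL_CHECK` macro are illustrative stand-ins, not PyTorch's actual code, which routes this through its own `groupStart()`/`groupEnd()` helpers and `NCCLComm` objects as shown in the diff above.

```cpp
#include <nccl.h>

#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

// Illustrative error-check helper; ProcessGroupNCCL uses its own checking macros.
#define NCCL_CHECK(cmd)                                            \
  do {                                                             \
    ncclResult_t r = (cmd);                                        \
    if (r != ncclSuccess) {                                        \
      std::fprintf(stderr, "NCCL error: %s (%s:%d)\n",             \
                   ncclGetErrorString(r), __FILE__, __LINE__);     \
      std::abort();                                                 \
    }                                                               \
  } while (0)

// Abort every communicator this process owns. The reverted fix wrapped the
// per-communicator aborts in ncclGroupStart()/ncclGroupEnd() so that NCCL
// sees them as one grouped operation when several communicators live on the
// same device.
void abortAllComms(std::map<std::string, ncclComm_t>& comms) {
  NCCL_CHECK(ncclGroupStart());
  for (auto& it : comms) {
    // ncclCommAbort frees the communicator's resources and aborts any
    // uncompleted operations on it.
    NCCL_CHECK(ncclCommAbort(it.second));
  }
  NCCL_CHECK(ncclGroupEnd());
}
```

Grouping the aborts lets NCCL tear down all of a device's communicators together rather than one at a time, which is what the reverted fix relied on to avoid the abort hang described in #150690 when one ProcessGroupNCCL manages multiple ncclComms.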