mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 00:20:18 +01:00
[c10d][fr] Fix another bug: we should continue when the op list is empty (#151798)
Differential Revision: D73375318 We shouldn't check the op list when it is empty. Later, once it is empty and we pop it from the queue, we will check for collective matching. Added a unit test for this case; the unit test also covers the case fixed in https://github.com/pytorch/pytorch/pull/151683. Pull Request resolved: https://github.com/pytorch/pytorch/pull/151798 Approved by: https://github.com/d4l3k, https://github.com/wconstab, https://github.com/fegin
This commit is contained in:
parent
6f327128a9
commit
95abc0f515
|
|
@ -338,6 +338,29 @@ class FlightRecorderE2ETest(TestCase):
|
|||
db.collectives[0].collective_name, "nccl:REDUCE_SCATTER_coalesced"
|
||||
)
|
||||
self.assertEqual(db.collectives[0].pass_check, True)
|
||||
# Test case 6: empty coalesced call on rank 0 case.
|
||||
details6 = copy.deepcopy(LOADED_FR_DETAIL_TEMPLATE)
|
||||
# sequence ID should not increase for coalesced collectives
|
||||
details6["dump_file_rank_0"]["entries"].append(
|
||||
create_one_entry(0, "all_reduce", [[4, 4]], [[4, 4]])
|
||||
)
|
||||
details6["dump_file_rank_1"]["entries"].append(
|
||||
create_one_entry(0, "all_reduce", [[4, 4]], [[4, 4]])
|
||||
)
|
||||
details6["dump_file_rank_1"]["entries"].append(
|
||||
create_one_entry(1, "_reduce_oop", [[4, 4]], [[4, 4]])
|
||||
)
|
||||
details6["dump_file_rank_1"]["entries"].append(
|
||||
create_one_entry(2, "_reduce_oop", [[4, 4]], [[4, 4]])
|
||||
)
|
||||
details6["dump_file_rank_1"]["entries"].append(
|
||||
create_one_entry(3, "REDUCE_SCATTER_coalesced", [[]], [[]])
|
||||
)
|
||||
db = build_db(details6, args, version)
|
||||
self.assertEqual(len(db.collectives), 2)
|
||||
self.assertEqual(db.collectives[1].collective_name, "nccl:_reduce_oop")
|
||||
self.assertEqual(db.collectives[1].record_id, 1)
|
||||
self.assertEqual(db.collectives[1].pass_check, True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -265,6 +265,7 @@ def match_coalesced_groups_with_non_p2p(
|
|||
for rank, op_list in all_ops.items():
|
||||
if not op_list:
|
||||
logger.error("Rank %s has an empty op list.", rank)
|
||||
continue
|
||||
if op_list[-1].type == "coalesced" and is_p2p:
|
||||
op_list.pop(-1)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user