[c10d][fr] Fix another bug: we should continue when the op list is empty (#151798)

Differential Revision: D73375318

We shouldn't check the op list when it is empty. Later, when it is empty and we pop it out of the queue, we will check for collective matching. Added a unit test for this case; the unit test also covers the case fixed in https://github.com/pytorch/pytorch/pull/151683.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151798
Approved by: https://github.com/d4l3k, https://github.com/wconstab, https://github.com/fegin
This commit is contained in:
Junjie Wang (PyTorch) 2025-04-22 04:43:31 +00:00 committed by PyTorch MergeBot
parent 6f327128a9
commit 95abc0f515
2 changed files with 24 additions and 0 deletions

View File

@ -338,6 +338,29 @@ class FlightRecorderE2ETest(TestCase):
db.collectives[0].collective_name, "nccl:REDUCE_SCATTER_coalesced"
)
self.assertEqual(db.collectives[0].pass_check, True)
# Test case 6: empty coalesced call on rank 0 case.
details6 = copy.deepcopy(LOADED_FR_DETAIL_TEMPLATE)
# sequence ID should not increase for coalesced collectives
details6["dump_file_rank_0"]["entries"].append(
create_one_entry(0, "all_reduce", [[4, 4]], [[4, 4]])
)
details6["dump_file_rank_1"]["entries"].append(
create_one_entry(0, "all_reduce", [[4, 4]], [[4, 4]])
)
details6["dump_file_rank_1"]["entries"].append(
create_one_entry(1, "_reduce_oop", [[4, 4]], [[4, 4]])
)
details6["dump_file_rank_1"]["entries"].append(
create_one_entry(2, "_reduce_oop", [[4, 4]], [[4, 4]])
)
details6["dump_file_rank_1"]["entries"].append(
create_one_entry(3, "REDUCE_SCATTER_coalesced", [[]], [[]])
)
db = build_db(details6, args, version)
self.assertEqual(len(db.collectives), 2)
self.assertEqual(db.collectives[1].collective_name, "nccl:_reduce_oop")
self.assertEqual(db.collectives[1].record_id, 1)
self.assertEqual(db.collectives[1].pass_check, True)
if __name__ == "__main__":

View File

@ -265,6 +265,7 @@ def match_coalesced_groups_with_non_p2p(
for rank, op_list in all_ops.items():
if not op_list:
logger.error("Rank %s has an empty op list.", rank)
continue
if op_list[-1].type == "coalesced" and is_p2p:
op_list.pop(-1)