mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[C10D] Make MultiProcContinuousTest less spammy (#160821)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160821
Approved by: https://github.com/fduwjj
ghstack dependencies: #160892
This commit is contained in:
parent
779fc29c04
commit
1ea918caf9
|
|
@ -1611,8 +1611,11 @@ class MultiProcContinuousTest(TestCase):
|
||||||
@classmethod
|
@classmethod
|
||||||
def _init_pg(cls, rank, world_size, rdvz_file):
|
def _init_pg(cls, rank, world_size, rdvz_file):
|
||||||
assert rdvz_file is not None
|
assert rdvz_file is not None
|
||||||
|
# rank should be local_rank for tests running on <= 8gpus which is how all these tests are designed
|
||||||
|
# and we expect LOCAL_RANK set by torchrun. Setting it lets init_device_mesh set the device without
|
||||||
|
# issuing a warning
|
||||||
|
os.environ["LOCAL_RANK"] = str(rank)
|
||||||
store = c10d.FileStore(rdvz_file, world_size)
|
store = c10d.FileStore(rdvz_file, world_size)
|
||||||
|
|
||||||
# create nccl processgroup with opts
|
# create nccl processgroup with opts
|
||||||
c10d.init_process_group(
|
c10d.init_process_group(
|
||||||
backend=cls.backend_str(),
|
backend=cls.backend_str(),
|
||||||
|
|
@ -1649,7 +1652,7 @@ class MultiProcContinuousTest(TestCase):
|
||||||
cls._init_pg(rank, world_size, rdvz_file)
|
cls._init_pg(rank, world_size, rdvz_file)
|
||||||
|
|
||||||
# End of bootstrap
|
# End of bootstrap
|
||||||
logger.info("Setup complete")
|
logger.debug("Setup complete")
|
||||||
|
|
||||||
# Loop forever, waiting for a test name to run
|
# Loop forever, waiting for a test name to run
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -1674,7 +1677,7 @@ class MultiProcContinuousTest(TestCase):
|
||||||
completion_queue.put(enhanced_ex)
|
completion_queue.put(enhanced_ex)
|
||||||
|
|
||||||
# Termination
|
# Termination
|
||||||
logger.info("Terminating ...")
|
logger.debug("Terminating ...")
|
||||||
# Calling destroy_process_group when workers have exceptions
|
# Calling destroy_process_group when workers have exceptions
|
||||||
# while others are doing collectives will cause a deadlock since
|
# while others are doing collectives will cause a deadlock since
|
||||||
# it waits for enqueued collectives to finish.
|
# it waits for enqueued collectives to finish.
|
||||||
|
|
@ -1711,7 +1714,7 @@ class MultiProcContinuousTest(TestCase):
|
||||||
cls.processes.append(process)
|
cls.processes.append(process)
|
||||||
cls.task_queues.append(task_queue)
|
cls.task_queues.append(task_queue)
|
||||||
cls.completion_queues.append(completion_queue)
|
cls.completion_queues.append(completion_queue)
|
||||||
logger.info("Started process %s with pid %s", rank, process.pid) # noqa: UP031
|
logger.debug("Started process %s with pid %s", rank, process.pid) # noqa: UP031
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def setUpClass(cls):
|
def setUpClass(cls):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user