mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[DCP] Decrease checkpoint background process Gloo pg init timeout (#162760)
Summary: Creation of the checkpoint background process sometimes times out during Gloo process group (pg) initialization. Attempting to destroy the process while that initialization is still in progress can block the trainer thread until the timeout expires. This diff reduces the pg init timeout from 30 minutes to 10 minutes to shorten the cleanup time.
Test Plan: CI
Rollback Plan:
Differential Revision: D81724668
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162760
Approved by: https://github.com/meetv18
This commit is contained in:
parent
b2553a6ec4
commit
0925c644ed
|
|
@@ -4,6 +4,7 @@ import logging
|
|||
import os
|
||||
from concurrent.futures import Future, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from datetime import timedelta
|
||||
from enum import Enum
|
||||
from typing import Any, Optional, Union
|
||||
from uuid import uuid4
|
||||
|
|
@@ -215,7 +216,9 @@ class _AsyncCheckpointProcess:
|
|||
"Initializing dist.ProcessGroup in checkpoint background process"
|
||||
)
|
||||
# NOTE: GLOO backend is enforced here.
|
||||
dist.init_process_group(backend=dist.Backend.GLOO)
|
||||
dist.init_process_group(
|
||||
backend=dist.Backend.GLOO, timeout=timedelta(seconds=600)
|
||||
)
|
||||
dist.barrier()
|
||||
|
||||
logger.info("Checkpoint background process is running...")
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user