mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
[SJD] [RFC] force setting last progress time (#138615)
Summary: Currently, if watchdog + healthcheck are enabled via knobs but watchdog is disabled via SJD config, we observe a hang when the watchdog loop attempts to open the watchdog file path. This is because the FileTimerClient that is usually set in TorchElasticWatchdog will not be set, since disabling watchdog via SJD config bypasses the TorchElasticWatchdog initialization. The workaround is to update the healthcheck time when calling `get_last_progress_time`. Test Plan: Logs show that the progress time value is being changed despite the client not being set. Behavior when watchdog is enabled with SJD config is left unchanged. Differential Revision: D64733766 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138615 Approved by: https://github.com/gag1jain
This commit is contained in:
parent
cdfe1bffd1
commit
c272526ea5
|
|
@ -179,6 +179,8 @@ class FileTimerServer:
|
|||
self._timers: Dict[Tuple[int, str], FileTimerRequest] = {}
|
||||
self._stop_signaled = False
|
||||
self._watchdog_thread: Optional[threading.Thread] = None
|
||||
|
||||
self._is_client_started = False
|
||||
if os.path.exists(self._file_path):
|
||||
os.remove(self._file_path)
|
||||
os.mkfifo(self._file_path)
|
||||
|
|
@ -249,6 +251,7 @@ class FileTimerServer:
|
|||
# 2. We are running the watchdog loop in a separate daemon
|
||||
# thread, which will not block the process to stop.
|
||||
with open(self._file_path) as fd:
|
||||
self._is_client_started = True
|
||||
while not self._stop_signaled:
|
||||
try:
|
||||
run_once = self._run_once
|
||||
|
|
@ -390,4 +393,4 @@ class FileTimerServer:
|
|||
return False
|
||||
|
||||
def get_last_progress_time(self) -> int:
|
||||
return self._last_progress_time
|
||||
return self._last_progress_time if self._is_client_started else int(time.time())
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user