diff --git a/torch/distributed/elastic/multiprocessing/errors/__init__.py b/torch/distributed/elastic/multiprocessing/errors/__init__.py index 174c89aa98a..fa6abc8794b 100644 --- a/torch/distributed/elastic/multiprocessing/errors/__init__.py +++ b/torch/distributed/elastic/multiprocessing/errors/__init__.py @@ -79,9 +79,9 @@ __all__ = [ logger = get_logger(__name__) -JSON = dict +JSON = dict[str, Any] -_EMPTY_ERROR_DATA = {"message": ""} +_EMPTY_ERROR_DATA: dict[str, Any] = {"message": ""} _NOT_AVAILABLE = "" _R = TypeVar("_R") @@ -143,6 +143,10 @@ class ProcessFailure: f" received by PID {self.pid}" ) else: + self.error_file_data["errorTraits"] = { + "category": "system_terminated_error", + "retryability": "False", + } self.message = "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html" def _get_error_data(self, error_file_data: dict[str, Any]) -> tuple[str, int]: