mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
[torch/elastic] Pretty print the failure message captured by @record (#64036)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64036

This PR slightly revises the implementation of the internal `_format_failure()` method in order to pretty-print the error message captured in a subprocess by the `record` decorator.

With this PR, a failure log is formatted as below:

```
Root Cause:
[0]:
  time: 2021-08-26_17:12:07
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 8045)
  error_file: /tmp/torchelastic_6cj9eppm/6d9d844a-6ce4-4838-93ed-1639a9525b00_rec9kuv3/attempt_0/0/error.json
  msg:
    {
      "message": "ValueError: Test",
      "extraInfo": {
        "py_callstack": [
          " File \"/data/home/balioglu/fail.py\", line 7, in <module>\n main()\n",
          " File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 373, in wrapper\n error_handler.record_exception(e)\n",
          " File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 86, in record_exception\n _write_error(e, self._get_error_file_path())\n",
          " File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 26, in _write_error\n \"py_callstack\": traceback.format_stack(),\n"
        ],
        "timestamp": "1629997927"
      }
    }
```

in contrast to the old formatting:

```
Root Cause:
[0]:
  time: 2021-08-26_17:15:50
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 9417)
  error_file: /tmp/torchelastic_22pwarnq/19f22638-848c-4b8f-8379-677f34fc44e7_u43o9vs7/attempt_0/0/error.json
  msg: "{'message': 'ValueError: Test', 'extraInfo': {'py_callstack': 'Traceback (most recent call last):\n File "/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 351, in wrapper\n return f(*args, **kwargs)\n File "/data/home/balioglu/fail.py", line 5, in main\n raise ValueError("BALIOGLU")\nValueError: BALIOGLU\n', 'timestamp': '1629998150'}}"
```

ghstack-source-id: 136761768

Test Plan: Run the existing unit tests.
Reviewed By: kiukchung
Differential Revision: D30579025
fbshipit-source-id: 37df0b7c7ec9b620355766122986c2c77e8495ae
This commit is contained in:
parent
5a12cb611f
commit
d8d8e4902a
|
|
@ -165,7 +165,7 @@ _FAILURE_FORMAT_TEMPLATE = """[${idx}]:
|
|||
rank: ${rank} (local_rank: ${local_rank})
|
||||
exitcode: ${exitcode} (pid: ${pid})
|
||||
error_file: ${error_file}
|
||||
msg: \"${message}\""""
|
||||
msg: ${message}"""
|
||||
|
||||
# extra new lines before and after are intentional
|
||||
_MSG_FORMAT_TEMPLATE = """
|
||||
|
|
@ -258,6 +258,19 @@ class ChildFailedError(Exception):
|
|||
def _format_failure(
|
||||
self, idx: int, rank: int, failure: ProcessFailure
|
||||
) -> Tuple[str, int]:
|
||||
if isinstance(failure.message, str):
|
||||
msg = '"' + failure.message + '"'
|
||||
else:
|
||||
try:
|
||||
dmp = json.dumps(failure.message, indent=2)
|
||||
except ValueError:
|
||||
msg = failure.message
|
||||
else:
|
||||
msg = os.linesep
|
||||
# Indent by 4 chars.
|
||||
for l in dmp.splitlines():
|
||||
msg += f" {l}{os.linesep}"
|
||||
|
||||
fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute(
|
||||
idx=idx,
|
||||
time=failure.timestamp_isoformat(),
|
||||
|
|
@ -266,7 +279,7 @@ class ChildFailedError(Exception):
|
|||
exitcode=failure.exitcode,
|
||||
pid=failure.pid,
|
||||
error_file=failure.error_file,
|
||||
message=failure.message,
|
||||
message=msg,
|
||||
)
|
||||
width = 0
|
||||
for line in fmt.split("\n"):
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ def _write_error(e: BaseException, error_file: Optional[str]):
|
|||
"message": {
|
||||
"message": f"{type(e).__name__}: {e}",
|
||||
"extraInfo": {
|
||||
"py_callstack": traceback.format_exc(),
|
||||
"py_callstack": traceback.format_stack(),
|
||||
"timestamp": str(int(time.time())),
|
||||
},
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user