[torch/elastic] Pretty print the failure message captured by @record (#64036)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64036

This PR slightly revises the implementation of the internal `_format_failure()` method in order to pretty-print the error message captured in a subprocess by the `@record` decorator.

With this PR, a failure log is formatted as follows:

```
Root Cause:
[0]:
  time: 2021-08-26_17:12:07
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 8045)
  error_file: /tmp/torchelastic_6cj9eppm/6d9d844a-6ce4-4838-93ed-1639a9525b00_rec9kuv3/attempt_0/0/error.json
  msg:
    {
      "message": "ValueError: Test",
      "extraInfo": {
        "py_callstack": [
          "  File \"/data/home/balioglu/fail.py\", line 7, in <module>\n    main()\n",
          "  File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 373, in wrapper\n    error_handler.record_exception(e)\n",
          "  File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 86, in record_exception\n    _write_error(e, self._get_error_file_path())\n",
          "  File \"/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/error_handler.py\", line 26, in _write_error\n    \"py_callstack\": traceback.format_stack(),\n"
        ],
        "timestamp": "1629997927"
      }
    }
```

in contrast to the old formatting:

```
Root Cause:
[0]:
  time: 2021-08-26_17:15:50
  rank: 0 (local_rank: 0)
  exitcode: 1 (pid: 9417)
  error_file: /tmp/torchelastic_22pwarnq/19f22638-848c-4b8f-8379-677f34fc44e7_u43o9vs7/attempt_0/0/error.json
  msg: "{'message': 'ValueError: Test', 'extraInfo': {'py_callstack': 'Traceback (most recent call last):\n  File "/fsx/users/balioglu/repos/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 351, in wrapper\n    return f(*args, **kwargs)\n  File "/data/home/balioglu/fail.py", line 5, in main\n    raise ValueError("BALIOGLU")\nValueError: BALIOGLU\n', 'timestamp': '1629998150'}}"
```
ghstack-source-id: 136761768

Test Plan: Run the existing unit tests.

Reviewed By: kiukchung

Differential Revision: D30579025

fbshipit-source-id: 37df0b7c7ec9b620355766122986c2c77e8495ae
This commit is contained in:
Can Balioglu 2021-08-26 13:55:08 -07:00 committed by Facebook GitHub Bot
parent 5a12cb611f
commit d8d8e4902a
2 changed files with 16 additions and 3 deletions

View File

@ -165,7 +165,7 @@ _FAILURE_FORMAT_TEMPLATE = """[${idx}]:
rank: ${rank} (local_rank: ${local_rank})
exitcode: ${exitcode} (pid: ${pid})
error_file: ${error_file}
msg: \"${message}\""""
msg: ${message}"""
# extra new lines before and after are intentional
_MSG_FORMAT_TEMPLATE = """
@ -258,6 +258,19 @@ class ChildFailedError(Exception):
def _format_failure(
self, idx: int, rank: int, failure: ProcessFailure
) -> Tuple[str, int]:
if isinstance(failure.message, str):
msg = '"' + failure.message + '"'
else:
try:
dmp = json.dumps(failure.message, indent=2)
except ValueError:
msg = failure.message
else:
msg = os.linesep
# Indent by 4 chars.
for l in dmp.splitlines():
msg += f" {l}{os.linesep}"
fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute(
idx=idx,
time=failure.timestamp_isoformat(),
@ -266,7 +279,7 @@ class ChildFailedError(Exception):
exitcode=failure.exitcode,
pid=failure.pid,
error_file=failure.error_file,
message=failure.message,
message=msg,
)
width = 0
for line in fmt.split("\n"):

View File

@ -23,7 +23,7 @@ def _write_error(e: BaseException, error_file: Optional[str]):
"message": {
"message": f"{type(e).__name__}: {e}",
"extraInfo": {
"py_callstack": traceback.format_exc(),
"py_callstack": traceback.format_stack(),
"timestamp": str(int(time.time())),
},
}