upload test stats: remove nan/inf when uploading (#136877)

`json.dumps(float("inf"))` returns `Infinity`, which is technically invalid JSON. This is fine if you `json.load` it, but ClickHouse cannot handle it. Solution here: cast inf and nan to string (which ClickHouse is able to cast back to float). Pull Request resolved: https://github.com/pytorch/pytorch/pull/136877 Approved by: https://github.com/huydhn
2025-12-06 00:20:18 +01:00 · 2024-10-01 21:47:46 +00:00 · 2024-10-01 21:47:46 +00:00 · 6baee60e3c
commit 6baee60e3c
parent 0788d016d6
3 changed files with 50 additions and 4 deletions
--- a/tools/stats/upload_stats_lib.py
+++ b/tools/stats/upload_stats_lib.py
@ -3,6 +3,7 @@ from __future__ import annotations
 import gzip
 import io
 import json
+import math
 import os
 import time
 import zipfile
@ -199,6 +200,23 @@ def read_from_s3(
    return [json.loads(result) for result in results if result]


+def remove_nan_inf(old: Any) -> Any:
+    # Cast NaN, inf, -inf to string from float since json.dumps outputs invalid
+    # json with them
+    def _helper(o: Any) -> Any:
+        if isinstance(o, float) and (math.isinf(o) or math.isnan(o)):
+            return str(o)
+        if isinstance(o, list):
+            return [_helper(v) for v in o]
+        if isinstance(o, dict):
+            return {_helper(k): _helper(v) for k, v in o.items()}
+        if isinstance(o, tuple):
+            return tuple(_helper(v) for v in o)
+        return o
+
+    return _helper(old)
+
+
 def upload_workflow_stats_to_s3(
    workflow_run_id: int,
    workflow_run_attempt: int,
--- a/tools/stats/upload_test_stats.py
+++ b/tools/stats/upload_test_stats.py
@ -13,6 +13,7 @@ from tools.stats.test_dashboard import upload_additional_info
 from tools.stats.upload_stats_lib import (
    download_s3_artifacts,
    get_job_id,
+    remove_nan_inf,
    unzip,
    upload_workflow_stats_to_s3,
 )
@ -266,7 +267,7 @@ if __name__ == "__main__":
        args.workflow_run_id,
        args.workflow_run_attempt,
        "test_run_summary",
-        test_case_summary,
+        remove_nan_inf(test_case_summary),
    )

    # Separate out the failed test cases.
@ -281,13 +282,16 @@ if __name__ == "__main__":
        args.workflow_run_id,
        args.workflow_run_attempt,
        "failed_test_runs",
-        failed_tests_cases,
+        remove_nan_inf(failed_tests_cases),
    )

    if args.head_branch == "main" and args.head_repository == "pytorch/pytorch":
        # For jobs on main branch, upload everything.
        upload_workflow_stats_to_s3(
-            args.workflow_run_id, args.workflow_run_attempt, "test_run", test_cases
+            args.workflow_run_id,
+            args.workflow_run_attempt,
+            "test_run",
+            remove_nan_inf(test_cases),
        )

    upload_additional_info(args.workflow_run_id, args.workflow_run_attempt, test_cases)
--- a/tools/test/test_upload_stats_lib.py
+++ b/tools/test/test_upload_stats_lib.py
@ -2,6 +2,7 @@ from __future__ import annotations

 import decimal
 import inspect
+import json
 import sys
 import unittest
 from pathlib import Path
@ -13,7 +14,7 @@ REPO_ROOT = Path(__file__).resolve().parent.parent.parent
 sys.path.insert(0, str(REPO_ROOT))

 from tools.stats.upload_metrics import add_global_metric, emit_metric
-from tools.stats.upload_stats_lib import BATCH_SIZE, upload_to_rockset
+from tools.stats.upload_stats_lib import BATCH_SIZE, remove_nan_inf, upload_to_rockset


 sys.path.remove(str(REPO_ROOT))
@ -335,6 +336,29 @@ class TestUploadStats(unittest.TestCase):
                expected_number_of_requests,
            )

+    def test_remove_nan_inf(self) -> None:
+        checks = [
+            (float("inf"), '"inf"', "Infinity"),
+            (float("nan"), '"nan"', "NaN"),
+            ({1: float("inf")}, '{"1": "inf"}', '{"1": Infinity}'),
+            ([float("nan")], '["nan"]', "[NaN]"),
+            ({1: [float("nan")]}, '{"1": ["nan"]}', '{"1": [NaN]}'),
+        ]
+
+        for input, clean, unclean in checks:
+            clean_output = json.dumps(remove_nan_inf(input))
+            unclean_output = json.dumps(input)
+            self.assertEqual(
+                clean_output,
+                clean,
+                f"Expected {clean} when input is {unclean}, got {clean_output}",
+            )
+            self.assertEqual(
+                unclean_output,
+                unclean,
+                f"Expected {unclean} when input is {unclean}, got {unclean_output}",
+            )
+

 if __name__ == "__main__":
    unittest.main()