Store test file in S3 as well for every TestSuite (#52869)

Summary:
We want to store the file names that trigger each test suite so that we can use this data for categorizing those test files.

~~After considering several solutions, this one is the most backwards compatible, and the current test cases in test_testing.py for print test stats don't break.~~

The previous plan did not work, as there are multiple Python test jobs that spawn the same suites. Instead, the new S3 format will store test files (e.g., `test_nn` and `distributed/test_distributed_fork`) which will contain the suites they spawn, which will contain the test cases run within the suite. (Currently, there is no top layer of test files.)

Because of this major structural change, a lot of changes have now been made (thank you samestep!) to test_history.py and print_test_stats.py to make this new format backwards compatible.

Old test plan:
Make sure that the data is as expected in S3 after https://github.com/pytorch/pytorch/pull/52873 finishes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/52869

Test Plan: Added tests to test_testing.py which pass, and CI.

Reviewed By: samestep

Differential Revision: D26672561

Pulled By: janeyx99

fbshipit-source-id: f46b91e16c1d9de5e0cb9bfa648b6448d979257e
This commit is contained in:
Jane Xu 2021-03-02 07:33:57 -08:00 committed by Facebook GitHub Bot
parent 931100f829
commit 09ce9b5877
5 changed files with 579 additions and 181 deletions

View File

@ -172,19 +172,24 @@ test_libtorch() {
# Start background download
python tools/download_mnist.py --quiet -d test/cpp/api/mnist &
# Make test_reports directory
# NB: the ending test_libtorch must match the current function name for the current
# test reporting process (in print_test_stats.py) to function as expected.
TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_libtorch
mkdir -p $TEST_REPORTS_DIR
# Run JIT cpp tests
mkdir -p test/test-reports/cpp-unittest
python test/cpp/jit/tests_setup.py setup
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
build/bin/test_jit --gtest_output=xml:test/test-reports/cpp-unittest/test_jit.xml
build/bin/test_jit --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
else
build/bin/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:test/test-reports/cpp-unittest/test_jit.xml
build/bin/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
fi
python test/cpp/jit/tests_setup.py shutdown
# Wait for background download to finish
wait
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api --gtest_output=xml:test/test-reports/cpp-unittest/test_api.xml
build/bin/test_tensorexpr --gtest_output=xml:test/test-reports/cpp-unittests/test_tensorexpr.xml
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
build/bin/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
assert_git_not_dirty
fi
}
@ -192,30 +197,39 @@ test_libtorch() {
test_vulkan() {
if [[ "$BUILD_ENVIRONMENT" == *vulkan-linux* ]]; then
export VK_ICD_FILENAMES=/var/lib/jenkins/swiftshader/build/Linux/vk_swiftshader_icd.json
mkdir -p test/test-reports/cpp-vulkan
build/bin/vulkan_test --gtest_output=xml:test/test-reports/cpp-vulkan/vulkan_test.xml
# NB: the ending test_vulkan must match the current function name for the current
# test reporting process (in print_test_stats.py) to function as expected.
TEST_REPORTS_DIR=test/test-reports/cpp-vulkan/test_vulkan
mkdir -p $TEST_REPORTS_DIR
build/bin/vulkan_test --gtest_output=xml:$TEST_REPORTS_DIR/vulkan_test.xml
fi
}
test_distributed() {
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
echo "Testing distributed C++ tests"
mkdir -p test/test-reports/cpp-distributed
build/bin/FileStoreTest --gtest_output=xml:test/test-reports/cpp-distributed/FileStoreTest.xml
build/bin/HashStoreTest --gtest_output=xml:test/test-reports/cpp-distributed/HashStoreTest.xml
build/bin/TCPStoreTest --gtest_output=xml:test/test-reports/cpp-distributed/TCPStoreTest.xml
# NB: the ending test_distributed must match the current function name for the current
# test reporting process (in print_test_stats.py) to function as expected.
TEST_REPORTS_DIR=test/test-reports/cpp-distributed/test_distributed
mkdir -p $TEST_REPORTS_DIR
build/bin/FileStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/FileStoreTest.xml
build/bin/HashStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/HashStoreTest.xml
build/bin/TCPStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/TCPStoreTest.xml
build/bin/ProcessGroupGlooTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupGlooTest.xml
build/bin/ProcessGroupNCCLTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupNCCLTest.xml
build/bin/ProcessGroupNCCLErrorsTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupNCCLErrorsTest.xml
build/bin/ProcessGroupGlooTest --gtest_output=xml:$TEST_REPORTS_DIR/ProcessGroupGlooTest.xml
build/bin/ProcessGroupNCCLTest --gtest_output=xml:$TEST_REPORTS_DIR/ProcessGroupNCCLTest.xml
build/bin/ProcessGroupNCCLErrorsTest --gtest_output=xml:$TEST_REPORTS_DIR/ProcessGroupNCCLErrorsTest.xml
fi
}
test_rpc() {
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
echo "Testing RPC C++ tests"
mkdir -p test/test-reports/cpp-rpc
build/bin/test_cpp_rpc --gtest_output=xml:test/test-reports/cpp-rpc/test_cpp_rpc.xml
# NB: the ending test_rpc must match the current function name for the current
# test reporting process (in print_test_stats.py) to function as expected.
TEST_REPORTS_DIR=test/test-reports/cpp-rpc/test_rpc
mkdir -p $TEST_REPORTS_DIR
build/bin/test_cpp_rpc --gtest_output=xml:$TEST_REPORTS_DIR/test_cpp_rpc.xml
fi
}

View File

@ -649,6 +649,17 @@ def fakehash(char):
return char * 40
def dummy_meta_meta() -> print_test_stats.ReportMetaMeta:
return {
'build_pr': '',
'build_tag': '',
'build_sha1': '',
'build_branch': '',
'build_job': '',
'build_workflow_id': '',
}
def makecase(name, seconds, *, errored=False, failed=False, skipped=False):
return {
'name': name,
@ -659,7 +670,7 @@ def makecase(name, seconds, *, errored=False, failed=False, skipped=False):
}
def makereport(tests):
def make_report_v1(tests) -> print_test_stats.Version1Report:
suites = {
suite_name: {
'total_seconds': sum(case['seconds'] for case in cases),
@ -668,59 +679,201 @@ def makereport(tests):
for suite_name, cases in tests.items()
}
return {
**dummy_meta_meta(),
'total_seconds': sum(s['total_seconds'] for s in suites.values()),
'suites': suites,
}
def make_case_v2(seconds, status=None) -> print_test_stats.Version2Case:
return {
'seconds': seconds,
'status': status,
}
def make_report_v2(tests) -> print_test_stats.Version2Report:
files = {}
for file_name, file_suites in tests.items():
suites = {
suite_name: {
'total_seconds': sum(case['seconds'] for case in cases.values()),
'cases': cases,
}
for suite_name, cases in file_suites.items()
}
files[file_name] = {
'suites': suites,
'total_seconds': sum(suite['total_seconds'] for suite in suites.values()),
}
return {
**dummy_meta_meta(),
'format_version': 2,
'total_seconds': sum(s['total_seconds'] for s in files.values()),
'files': files,
}
class TestPrintTestStats(TestCase):
maxDiff = None
def test_analysis(self):
head_report = makereport({
# input ordering of the suites is ignored
'Grault': [
# not printed: status same and time similar
makecase('test_grault0', 4.78, failed=True),
# status same, but time increased a lot
makecase('test_grault2', 1.473, errored=True),
],
# individual tests times changed, not overall suite
'Qux': [
# input ordering of the test cases is ignored
makecase('test_qux1', 0.001, skipped=True),
makecase('test_qux6', 0.002, skipped=True),
# time in bounds, but status changed
makecase('test_qux4', 7.158, failed=True),
# not printed because it's the same as before
makecase('test_qux7', 0.003, skipped=True),
makecase('test_qux5', 11.968),
makecase('test_qux3', 23.496),
],
# new test suite
'Bar': [
makecase('test_bar2', 3.742, failed=True),
makecase('test_bar1', 50.447),
],
# overall suite time changed but no individual tests
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
# suite doesn't show up if it doesn't change enough
'Foo': [
makecase('test_foo1', 42),
makecase('test_foo2', 56),
],
version1_report: print_test_stats.Version1Report = make_report_v1({
# input ordering of the suites is ignored
'Grault': [
# not printed: status same and time similar
makecase('test_grault0', 4.78, failed=True),
# status same, but time increased a lot
makecase('test_grault2', 1.473, errored=True),
],
# individual tests times changed, not overall suite
'Qux': [
# input ordering of the test cases is ignored
makecase('test_qux1', 0.001, skipped=True),
makecase('test_qux6', 0.002, skipped=True),
# time in bounds, but status changed
makecase('test_qux4', 7.158, failed=True),
# not printed because it's the same as before
makecase('test_qux7', 0.003, skipped=True),
makecase('test_qux5', 11.968),
makecase('test_qux3', 23.496),
],
# new test suite
'Bar': [
makecase('test_bar2', 3.742, failed=True),
makecase('test_bar1', 50.447),
],
# overall suite time changed but no individual tests
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
# suite doesn't show up if it doesn't change enough
'Foo': [
makecase('test_foo1', 42),
makecase('test_foo2', 56),
],
})
version2_report: print_test_stats.Version2Report = make_report_v2(
{
'test_a': {
'Grault': {
'test_grault0': make_case_v2(4.78, 'failed'),
'test_grault2': make_case_v2(1.473, 'errored'),
},
'Qux': {
'test_qux1': make_case_v2(0.001, 'skipped'),
'test_qux6': make_case_v2(0.002, 'skipped'),
'test_qux4': make_case_v2(7.158, 'failed'),
'test_qux7': make_case_v2(0.003, 'skipped'),
'test_qux8': make_case_v2(11.968),
'test_qux3': make_case_v2(23.496),
}
},
'test_b': {
'Bar': {
'test_bar2': make_case_v2(3.742, 'failed'),
'test_bar1': make_case_v2(50.447),
},
# overall suite time changed but no individual tests
'Norf': {
'test_norf1': make_case_v2(3),
'test_norf2': make_case_v2(3),
'test_norf3': make_case_v2(3),
'test_norf4': make_case_v2(3),
},
},
'test_c': {
'Foo': {
'test_foo1': make_case_v2(42),
'test_foo2': make_case_v2(56),
},
}
})
def test_simplify(self):
self.assertEqual(
{
'': {
'Bar': {
'test_bar1': {'seconds': 50.447, 'status': None},
'test_bar2': {'seconds': 3.742, 'status': 'failed'},
},
'Foo': {
'test_foo1': {'seconds': 42, 'status': None},
'test_foo2': {'seconds': 56, 'status': None},
},
'Grault': {
'test_grault0': {'seconds': 4.78, 'status': 'failed'},
'test_grault2': {'seconds': 1.473, 'status': 'errored'},
},
'Norf': {
'test_norf1': {'seconds': 3, 'status': None},
'test_norf3': {'seconds': 3, 'status': None},
'test_norf2': {'seconds': 3, 'status': None},
'test_norf4': {'seconds': 3, 'status': None},
},
'Qux': {
'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
'test_qux3': {'seconds': 23.496, 'status': None},
'test_qux4': {'seconds': 7.158, 'status': 'failed'},
'test_qux5': {'seconds': 11.968, 'status': None},
'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
},
},
},
print_test_stats.simplify(self.version1_report)
)
self.assertEqual(
{
'test_a': {
'Grault': {
'test_grault0': {'seconds': 4.78, 'status': 'failed'},
'test_grault2': {'seconds': 1.473, 'status': 'errored'},
},
'Qux': {
'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
'test_qux3': {'seconds': 23.496, 'status': None},
'test_qux4': {'seconds': 7.158, 'status': 'failed'},
'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
'test_qux8': {'seconds': 11.968, 'status': None},
},
},
'test_b': {
'Bar': {
'test_bar1': {'seconds': 50.447, 'status': None},
'test_bar2': {'seconds': 3.742, 'status': 'failed'},
},
'Norf': {
'test_norf1': {'seconds': 3, 'status': None},
'test_norf2': {'seconds': 3, 'status': None},
'test_norf3': {'seconds': 3, 'status': None},
'test_norf4': {'seconds': 3, 'status': None},
},
},
'test_c': {
'Foo': {
'test_foo1': {'seconds': 42, 'status': None},
'test_foo2': {'seconds': 56, 'status': None},
},
},
},
print_test_stats.simplify(self.version2_report),
)
def test_analysis(self):
head_report = self.version1_report
base_reports = {
# bbbb has no reports, so base is cccc instead
fakehash('b'): [],
fakehash('c'): [
makereport({
make_report_v1({
'Baz': [
makecase('test_baz2', 13.605),
# no recent suites have & skip this test
@ -753,7 +906,7 @@ class TestPrintTestStats(TestCase):
}),
],
fakehash('d'): [
makereport({
make_report_v1({
'Foo': [
makecase('test_foo1', 40),
# removed in cccc
@ -783,7 +936,7 @@ class TestPrintTestStats(TestCase):
],
fakehash('e'): [],
fakehash('f'): [
makereport({
make_report_v1({
'Foo': [
makecase('test_foo3', 24),
makecase('test_foo1', 43),
@ -1066,14 +1219,14 @@ Added (across 1 suite) 1 test, totaling + 3.00s
''',
print_test_stats.regression_info(
head_sha=fakehash('a'),
head_report=makereport({
head_report=make_report_v1({
'Foo': [
makecase('test_foo', 0.02, skipped=True),
makecase('test_baz', 3),
]}),
base_reports={
fakehash('b'): [
makereport({
make_report_v1({
'Foo': [
makecase('test_foo', 40),
makecase('test_bar', 1),
@ -1081,7 +1234,7 @@ Added (across 1 suite) 1 test, totaling + 3.00s
}),
],
fakehash('c'): [
makereport({
make_report_v1({
'Foo': [
makecase('test_foo', 43),
],
@ -1135,7 +1288,7 @@ Added (across 1 suite) 2 tests, totaling + 3.02s
''',
print_test_stats.regression_info(
head_sha=fakehash('a'),
head_report=makereport({
head_report=make_report_v1({
'Foo': [
makecase('test_foo', 0.02, skipped=True),
makecase('test_baz', 3),

View File

@ -289,7 +289,7 @@ class TestCheckpoint(TestCase):
out = checkpoint(run_fn2, input_var, input_var2)
out.sum().backward()
class TestDataLoader(TestCase):
class TestDataLoaderUtils(TestCase):
def setUp(self):
self.dataset = torch.randn(5, 3, 3, 2)
self.batch_size = 3

View File

@ -6,11 +6,11 @@ import json
import subprocess
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
import boto3 # type: ignore[import]
import botocore # type: ignore[import]
from typing_extensions import TypedDict
from typing_extensions import Literal, TypedDict
def get_git_commit_history(
@ -36,31 +36,70 @@ def get_object_summaries(*, bucket: Any, sha: str) -> Dict[str, List[Any]]:
return dict(by_job)
class Case(TypedDict):
name: str
# TODO: consolidate these typedefs with the identical ones in
# torch/testing/_internal/print_test_stats.py
Commit = str # 40-digit SHA-1 hex string
Status = Optional[Literal['errored', 'failed', 'skipped']]
class CaseMeta(TypedDict):
seconds: float
class Version1Case(CaseMeta):
name: str
errored: bool
failed: bool
skipped: bool
class Suite(TypedDict):
class Version1Suite(TypedDict):
total_seconds: float
cases: List[Case]
cases: List[Version1Case]
class ReportMeta(TypedDict):
class ReportMetaMeta(TypedDict):
build_pr: str
build_tag: str
build_sha1: str
build_sha1: Commit
build_branch: str
build_job: str
build_workflow_id: str
class Report(ReportMeta):
class ReportMeta(ReportMetaMeta):
total_seconds: float
suites: Dict[str, Suite]
class Version1Report(ReportMeta):
suites: Dict[str, Version1Suite]
class Version2Case(CaseMeta):
status: Status
class Version2Suite(TypedDict):
total_seconds: float
cases: Dict[str, Version2Case]
class Version2File(TypedDict):
total_seconds: float
suites: Dict[str, Version2Suite]
class VersionedReport(ReportMeta):
format_version: int
# report: Version2Report implies report['format_version'] == 2
class Version2Report(VersionedReport):
files: Dict[str, Version2File]
Report = Union[Version1Report, VersionedReport]
def get_jsons(
@ -77,32 +116,63 @@ def get_jsons(
}
# TODO: consolidate this with the case_status function from
# torch/testing/_internal/print_test_stats.py
def case_status(case: Version1Case) -> Status:
for k in {'errored', 'failed', 'skipped'}:
if case[k]: # type: ignore[misc]
return cast(Status, k)
return None
# TODO: consolidate this with the newify_case function from
# torch/testing/_internal/print_test_stats.py
def newify_case(case: Version1Case) -> Version2Case:
return {
'seconds': case['seconds'],
'status': case_status(case),
}
# TODO: consolidate this with the simplify function from
# torch/testing/_internal/print_test_stats.py
def get_cases(
*,
data: Report,
filename: Optional[str],
suite_name: Optional[str],
test_name: str,
) -> List[Case]:
cases = []
suites = data['suites']
for name, suite in suites.items():
if name == suite_name or not suite_name:
for case in suite['cases']:
if case['name'] == test_name:
cases.append(case)
) -> List[Version2Case]:
cases: List[Version2Case] = []
if 'format_version' not in data: # version 1 implicitly
v1report = cast(Version1Report, data)
suites = v1report['suites']
for sname, v1suite in suites.items():
if sname == suite_name or not suite_name:
for v1case in v1suite['cases']:
if v1case['name'] == test_name:
cases.append(newify_case(v1case))
else:
v_report = cast(VersionedReport, data)
version = v_report['format_version']
if version == 2:
v2report = cast(Version2Report, v_report)
for fname, v2file in v2report['files'].items():
if fname == filename or not filename:
for sname, v2suite in v2file['suites'].items():
if sname == suite_name or not suite_name:
v2case = v2suite['cases'].get(test_name)
if v2case:
cases.append(v2case)
else:
raise RuntimeError(f'Unknown format version: {version}')
return cases
def case_status(case: Case) -> Optional[str]:
for k in {'errored', 'failed', 'skipped'}:
if case[k]: # type: ignore[misc]
return k
return None
def make_column(
*,
data: Optional[Report],
filename: Optional[str],
suite_name: Optional[str],
test_name: str,
digits: int,
@ -112,12 +182,13 @@ def make_column(
if data:
cases = get_cases(
data=data,
filename=filename,
suite_name=suite_name,
test_name=test_name
)
if cases:
case = cases[0]
status = case_status(case)
status = case['status']
omitted = len(cases) - 1
if status:
return f'{status.rjust(num_length)} ', omitted
@ -134,6 +205,7 @@ def make_columns(
jobs: List[str],
jsons: Dict[str, Report],
omitted: Dict[str, int],
filename: Optional[str],
suite_name: Optional[str],
test_name: str,
digits: int,
@ -145,6 +217,7 @@ def make_columns(
data = jsons.get(job)
column, omitted_suites = make_column(
data=data,
filename=filename,
suite_name=suite_name,
test_name=test_name,
digits=digits,
@ -165,6 +238,7 @@ def make_lines(
jobs: Set[str],
jsons: Dict[str, Report],
omitted: Dict[str, int],
filename: Optional[str],
suite_name: Optional[str],
test_name: str,
) -> List[str]:
@ -172,12 +246,13 @@ def make_lines(
for job, data in jsons.items():
cases = get_cases(
data=data,
filename=filename,
suite_name=suite_name,
test_name=test_name,
)
if cases:
case = cases[0]
status = case_status(case)
status = case['status']
line = f'{job} {case["seconds"]}s{f" {status}" if status else ""}'
if job in omitted and omitted[job] > 0:
line += f' ({omitted[job]} S3 reports omitted)'
@ -197,6 +272,7 @@ def display_history(
bucket: Any,
commits: List[Tuple[str, datetime]],
jobs: Optional[List[str]],
filename: Optional[str],
suite_name: Optional[str],
test_name: str,
delta: int,
@ -226,6 +302,7 @@ def display_history(
jobs=jobs,
jsons=jsons,
omitted=omitted,
filename=filename,
suite_name=suite_name,
test_name=test_name,
digits=digits,
@ -236,6 +313,7 @@ def display_history(
jobs=set(jobs or []),
jsons=jsons,
omitted=omitted,
filename=filename,
suite_name=suite_name,
test_name=test_name,
)
@ -352,6 +430,10 @@ indicated test was not found in that report.
action='store_true',
help='(multiline) ignore listed jobs, show all jobs for each commit',
)
parser.add_argument(
'--file',
help='name of the file containing the test',
)
parser.add_argument(
'--suite',
help='name of the suite containing the test',
@ -381,6 +463,7 @@ indicated test was not found in that report.
bucket=bucket,
commits=commits,
jobs=jobs,
filename=args.file,
suite_name=args.suite,
test_name=args.test,
delta=args.delta,

View File

@ -5,6 +5,7 @@ import datetime
import json
import math
import os
import re
import statistics
import subprocess
import time
@ -12,11 +13,11 @@ from collections import defaultdict
from glob import glob
from pathlib import Path
from typing import (Any, DefaultDict, Dict, Iterable, Iterator, List, Optional,
Tuple)
Set, Tuple, Union, cast)
from xml.dom import minidom # type: ignore[import]
import requests
from typing_extensions import TypedDict
from typing_extensions import Literal, TypedDict
try:
import boto3 # type: ignore[import]
@ -24,29 +25,30 @@ try:
except ImportError:
HAVE_BOTO3 = False
# TODO: consolidate these typedefs with the identical ones in
# tools/test_history.py
Commit = str # 40-digit SHA-1 hex string
Status = Optional[str] # errored, failed, skipped, or None
# represent suite as dict because indexing is useful
SimplerCase = Tuple[float, Status]
SimplerSuite = Dict[str, SimplerCase]
SimplerReport = Dict[str, SimplerSuite]
Status = Optional[Literal['errored', 'failed', 'skipped']]
class Case(TypedDict):
name: str
class CaseMeta(TypedDict):
seconds: float
class Version1Case(CaseMeta):
name: str
errored: bool
failed: bool
skipped: bool
class Suite(TypedDict):
class Version1Suite(TypedDict):
total_seconds: float
cases: List[Case]
cases: List[Version1Case]
class ReportMeta(TypedDict, total=False):
class ReportMetaMeta(TypedDict):
build_pr: str
build_tag: str
build_sha1: Commit
@ -55,9 +57,42 @@ class ReportMeta(TypedDict, total=False):
build_workflow_id: str
class Report(ReportMeta):
class ReportMeta(ReportMetaMeta):
total_seconds: float
suites: Dict[str, Suite]
class Version1Report(ReportMeta):
suites: Dict[str, Version1Suite]
class Version2Case(CaseMeta):
status: Status
class Version2Suite(TypedDict):
total_seconds: float
cases: Dict[str, Version2Case]
class Version2File(TypedDict):
total_seconds: float
suites: Dict[str, Version2Suite]
class VersionedReport(ReportMeta):
format_version: int
# report: Version2Report implies report['format_version'] == 2
class Version2Report(VersionedReport):
files: Dict[str, Version2File]
Report = Union[Version1Report, VersionedReport]
SimplerSuite = Dict[str, Version2Case]
SimplerFile = Dict[str, SimplerSuite]
SimplerReport = Dict[str, SimplerFile]
class Stat(TypedDict):
@ -69,7 +104,7 @@ class CaseDiff(TypedDict):
margin: str
name: str
was: Optional[Tuple[Stat, Status]]
now: Optional[SimplerCase]
now: Optional[Version2Case]
class SuiteDiff(TypedDict):
@ -80,23 +115,78 @@ class SuiteDiff(TypedDict):
cases: List[CaseDiff]
def case_status(case: Case) -> Status:
# TODO: consolidate this with the case_status function from
# tools/test_history.py
def case_status(case: Version1Case) -> Status:
for k in {'errored', 'failed', 'skipped'}:
if case[k]: # type: ignore[misc]
return k
return cast(Status, k)
return None
def simplify(report: Report) -> SimplerReport:
# TODO: consolidate this with the newify_case function from
# tools/test_history.py
def newify_case(case: Version1Case) -> Version2Case:
return {
suite_name: {
case['name']: (case['seconds'], case_status(case))
for case in suite['cases']
}
for suite_name, suite in report['suites'].items()
'seconds': case['seconds'],
'status': case_status(case),
}
# TODO: consolidate this with the get_cases function from
# tools/test_history.py
# Here we translate to a three-layer format (file -> suite -> case)
# rather than a two-layer format (suite -> case) because as mentioned in
# a comment in the body of this function, if we consolidate suites that
# share a name, there will be test case name collisions, and once we
# have those, there's no clean way to deal with it in the diffing logic.
# It's not great to have to add a dummy empty string for the filename
# for version 1 reports, but it's better than either losing cases that
# share a name (for version 2 reports) or using a list of cases rather
# than a dict.
def simplify(report: Report) -> SimplerReport:
if 'format_version' not in report: # version 1 implicitly
v1report = cast(Version1Report, report)
return {
# we just don't have test filename information sadly, so we
# just make one fake filename that is the empty string
'': {
suite_name: {
# This clobbers some cases that have duplicate names
# because in version 1, we would merge together all
# the suites with a given name (even if they came
# from different files), so there were actually
# situations in which two cases in the same suite
# shared a name (because they actually originally
# came from two suites that were then merged). It
# would probably be better to warn about the cases
# that we're silently discarding here, but since
# we're only uploading in the new format (where
# everything is also keyed by filename) going
# forward, it shouldn't matter too much.
case['name']: newify_case(case)
for case in suite['cases']
}
for suite_name, suite in v1report['suites'].items()
}
}
else:
v_report = cast(VersionedReport, report)
version = v_report['format_version']
if version == 2:
v2report = cast(Version2Report, v_report)
return {
filename: {
suite_name: suite['cases']
for suite_name, suite in file_data['suites'].items()
}
for filename, file_data in v2report['files'].items()
}
else:
raise RuntimeError(f'Unknown format version: {version}')
def plural(n: int) -> str:
return '' if n == 1 else 's'
@ -165,7 +255,9 @@ def unlines(lines: List[str]) -> str:
def matching_test_times(
*,
base_reports: Dict[Commit, List[SimplerReport]],
filename: str,
suite_name: str,
case_name: str,
status: Status,
@ -173,13 +265,16 @@ def matching_test_times(
times: List[float] = []
for reports in base_reports.values():
for report in reports:
suite = report.get(suite_name)
if suite:
case = suite.get(case_name)
if case:
t, s = case
if s == status:
times.append(t)
file_data = report.get(filename)
if file_data:
suite = file_data.get(suite_name)
if suite:
case = suite.get(case_name)
if case:
t = case['seconds']
s = case['status']
if s == status:
times.append(t)
return times
@ -195,30 +290,43 @@ def analyze(
# find all relevant suites (those in either base or head or both)
all_reports = [head_report] + base_report
all_suites = {k for r in all_reports for k in r.keys()}
all_suites: Set[Tuple[str, str]] = {
(filename, suite_name)
for r in all_reports
for filename, file_data in r.items()
for suite_name in file_data.keys()
}
removed_suites: List[SuiteDiff] = []
modified_suites: List[SuiteDiff] = []
added_suites: List[SuiteDiff] = []
for suite_name in sorted(all_suites):
for filename, suite_name in sorted(all_suites):
case_diffs: List[CaseDiff] = []
head_suite = head_report.get(suite_name)
head_suite = head_report.get(filename, {}).get(suite_name)
base_cases: Dict[str, Status] = dict(sorted(set.intersection(*[
{(n, s) for n, (_, s) in report.get(suite_name, {}).items()}
{
(n, case['status'])
for n, case
in report.get(filename, {}).get(suite_name, {}).items()
}
for report in base_report
] or [set()])))
case_stats: Dict[str, Stat] = {}
if head_suite:
now = sum(case[0] for case in head_suite.values())
if any(suite_name in report for report in base_report):
now = sum(case['seconds'] for case in head_suite.values())
if any(
filename in report and suite_name in report[filename]
for report in base_report
):
removed_cases: List[CaseDiff] = []
for case_name, case_status in base_cases.items():
case_stats[case_name] = list_stat(matching_test_times(
base_reports,
suite_name,
case_name,
case_status,
base_reports=base_reports,
filename=filename,
suite_name=suite_name,
case_name=case_name,
status=case_status,
))
if case_name not in head_suite:
removed_cases.append({
@ -234,7 +342,7 @@ def analyze(
if head_case_name in base_cases:
stat = case_stats[head_case_name]
base_status = base_cases[head_case_name]
if head_case[1] != base_status:
if head_case['status'] != base_status:
modified_cases.append({
'margin': '!',
'name': head_case_name,
@ -278,10 +386,11 @@ def analyze(
else:
for case_name, case_status in base_cases.items():
case_stats[case_name] = list_stat(matching_test_times(
base_reports,
suite_name,
case_name,
case_status,
base_reports=base_reports,
filename=filename,
suite_name=suite_name,
case_name=case_name,
status=case_status,
))
case_diffs.append({
'margin': ' ',
@ -316,9 +425,9 @@ def case_diff_lines(diff: CaseDiff) -> List[str]:
now = diff['now']
if now:
now_stat: Stat = {'center': now[0], 'spread': None}
now_stat: Stat = {'center': now['seconds'], 'spread': None}
now_line = f' # now {display_stat(now_stat, case_fmt)}'
now_status = now[1]
now_status = now['status']
if now_status:
now_line += f' ({now_status})'
lines.append(now_line)
@ -410,7 +519,7 @@ def case_delta(case: CaseDiff) -> Stat:
now = case['now']
return recenter(
was[0] if was else zero_stat(),
now[0] if now else 0,
now['seconds'] if now else 0,
)
@ -542,7 +651,7 @@ class TestCase:
class TestSuite:
def __init__(self, name: str) -> None:
self.name = name
self.test_cases: List[TestCase] = []
self.test_cases: Dict[str, TestCase] = dict()
self.failed_count = 0
self.skipped_count = 0
self.errored_count = 0
@ -555,14 +664,14 @@ class TestSuite:
return f'TestSuite({rc})'
def append(self, test_case: TestCase) -> None:
self.test_cases.append(test_case)
self.test_cases[test_case.name] = test_case
self.total_time += test_case.time
self.failed_count += 1 if test_case.failed else 0
self.skipped_count += 1 if test_case.skipped else 0
self.errored_count += 1 if test_case.errored else 0
def print_report(self, num_longest: int = 3) -> None:
sorted_tests = sorted(self.test_cases, key=lambda x: x.time)
sorted_tests = sorted(self.test_cases.values(), key=lambda x: x.time)
test_count = len(sorted_tests)
print(f"class {self.name}:")
print(f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} errored: {self.errored_count}")
@ -577,25 +686,48 @@ class TestSuite:
print("")
class TestFile:
def __init__(self, name: str) -> None:
self.name = name
self.total_time = 0.0
self.test_suites: Dict[str, TestSuite] = dict()
def append(self, test_case: TestCase) -> None:
suite_name = test_case.class_name
if suite_name not in self.test_suites:
self.test_suites[suite_name] = TestSuite(suite_name)
if test_case.name in self.test_suites[suite_name].test_cases:
# This behaviour is expected for test_cpp_extensions_aot, distributed/test_distributed_fork,
# and distributed/test_distributed_spawn. In these cases, we just lump the duplicate tests together--
# which is admittedly inaccurate for test_cpp_extensions_aot, though this is negligible as the test is short.
# For other unexpected cases, we should raise a warning.
if self.name != 'test_cpp_extensions_aot' and \
self.name != 'distributed/test_distributed_fork' and \
self.name != 'distributed/test_distributed_spawn' and \
self.name != 'cpp': # Also allow this cpp one, as it runs twice in caffe2 ort jobs
raise RuntimeWarning(f'Duplicate test case {test_case.name} in suite {suite_name} called from {self.name}')
self.test_suites[suite_name].append(test_case)
self.total_time += test_case.time
def parse_report(path: str) -> Iterator[TestCase]:
dom = minidom.parse(path)
for test_case in dom.getElementsByTagName('testcase'):
yield TestCase(test_case)
def parse_reports(folder: str) -> Dict[str, TestSuite]:
def parse_reports(folder: str) -> Dict[str, TestFile]:
reports = glob(os.path.join(folder, '**', '*.xml'), recursive=True)
tests_by_class = dict()
tests_by_file = dict()
for report in reports:
test_filename = re.sub(r'\.', '/', os.path.basename(os.path.dirname(report)))
if test_filename not in tests_by_file:
tests_by_file[test_filename] = TestFile(test_filename)
for test_case in parse_report(report):
class_name = test_case.class_name
if class_name not in tests_by_class:
tests_by_class[class_name] = TestSuite(class_name)
tests_by_class[class_name].append(test_case)
return tests_by_class
tests_by_file[test_filename].append(test_case)
return tests_by_file
def build_info() -> ReportMeta:
def build_info() -> ReportMetaMeta:
return {
"build_pr": os.environ.get("CIRCLE_PR_NUMBER", ""),
"build_tag": os.environ.get("CIRCLE_TAG", ""),
@ -624,7 +756,7 @@ def build_message(test_case: TestCase) -> Dict[str, Dict[str, Any]]:
}
def send_report_to_scribe(reports: Dict[str, TestSuite]) -> None:
def send_report_to_scribe(reports: Dict[str, TestFile]) -> None:
access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN")
if not access_token:
@ -643,8 +775,9 @@ def send_report_to_scribe(reports: Dict[str, TestSuite]) -> None:
"message": json.dumps(build_message(test_case)),
"line_escape": False,
}
for name in sorted(reports.keys())
for test_case in reports[name].test_cases
for test_file in reports.values()
for test_suite in test_file.test_suites.values()
for test_case in test_suite.test_cases.values()
]
),
},
@ -653,33 +786,40 @@ def send_report_to_scribe(reports: Dict[str, TestSuite]) -> None:
def assemble_s3_object(
reports: Dict[str, TestSuite],
reports: Dict[str, TestFile],
*,
total_seconds: float,
) -> Report:
) -> Version2Report:
return {
**build_info(), # type: ignore[misc]
'total_seconds': total_seconds,
'suites': {
'format_version': 2,
'files' : {
name: {
'total_seconds': suite.total_time,
'cases': [
{
'name': case.name,
'seconds': case.time,
'errored': case.errored,
'failed': case.failed,
'skipped': case.skipped,
'total_seconds': test_file.total_time,
'filename': test_file.name,
'suites': {
name: {
'total_seconds': suite.total_time,
'cases': {
name: {
'seconds': case.time,
'status': 'skipped' if case.skipped else
'errored' if case.errored else
'failed' if case.failed else None
}
for name, case in suite.test_cases.items()
},
}
for case in suite.test_cases
],
for name, suite in test_file.test_suites.items()
}
}
for name, suite in reports.items()
for name, test_file in reports.items()
}
}
def send_report_to_s3(head_report: Report) -> None:
def send_report_to_s3(head_report: Version2Report) -> None:
job = os.environ.get('CIRCLE_JOB')
sha1 = os.environ.get('CIRCLE_SHA1')
branch = os.environ.get('CIRCLE_BRANCH', '')
@ -773,6 +913,13 @@ def positive_float(value: str) -> float:
return parsed
def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool:
for test_file in reports.values():
for test_suite in test_file.test_suites.values():
if len(test_suite.test_cases) > 0:
return False
return True
if __name__ == '__main__':
import argparse
import sys
@ -830,24 +977,25 @@ if __name__ == '__main__':
)
args = parser.parse_args()
reports = parse_reports(args.folder)
if len(reports) == 0:
print(f"No test reports found in {args.folder}")
reports_by_file = parse_reports(args.folder)
if reports_has_no_tests(reports_by_file):
print(f"No tests in reports found in {args.folder}")
sys.exit(0)
send_report_to_scribe(reports)
send_report_to_scribe(reports_by_file)
longest_tests = []
# longest_tests can contain duplicates, as the same tests can be spawned from different files
longest_tests : List[TestCase] = []
total_time = 0.0
for name in sorted(reports.keys()):
test_suite = reports[name]
if test_suite.total_time >= args.class_print_threshold:
test_suite.print_report(args.longest_of_class)
total_time += test_suite.total_time
longest_tests.extend(test_suite.test_cases)
for filename, test_filename in reports_by_file.items():
for suite_name, test_suite in test_filename.test_suites.items():
if test_suite.total_time >= args.class_print_threshold:
test_suite.print_report(args.longest_of_class)
total_time += test_suite.total_time
longest_tests.extend(test_suite.test_cases.values())
longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:]
obj = assemble_s3_object(reports, total_seconds=total_time)
obj = assemble_s3_object(reports_by_file, total_seconds=total_time)
if args.upload_to_s3:
send_report_to_s3(obj)