diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 364675b9011..0d5873d357f 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -172,19 +172,24 @@ test_libtorch() {
     # Start background download
     python tools/download_mnist.py --quiet -d test/cpp/api/mnist &
 
+    # Make test_reports directory
+    # NB: the ending test_libtorch must match the current function name for the current
+    # test reporting process (in print_test_stats.py) to function as expected.
+    TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_libtorch
+    mkdir -p $TEST_REPORTS_DIR
+
     # Run JIT cpp tests
-    mkdir -p test/test-reports/cpp-unittest
     python test/cpp/jit/tests_setup.py setup
     if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-      build/bin/test_jit --gtest_output=xml:test/test-reports/cpp-unittest/test_jit.xml
+      build/bin/test_jit --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
     else
-      build/bin/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:test/test-reports/cpp-unittest/test_jit.xml
+      build/bin/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
     fi
     python test/cpp/jit/tests_setup.py shutdown
     # Wait for background download to finish
     wait
-    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api --gtest_output=xml:test/test-reports/cpp-unittest/test_api.xml
-    build/bin/test_tensorexpr --gtest_output=xml:test/test-reports/cpp-unittests/test_tensorexpr.xml
+    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
+    build/bin/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
     assert_git_not_dirty
   fi
 }
@@ -192,30 +197,39 @@ test_libtorch() {
 test_vulkan() {
   if [[ "$BUILD_ENVIRONMENT" == *vulkan-linux* ]]; then
     export VK_ICD_FILENAMES=/var/lib/jenkins/swiftshader/build/Linux/vk_swiftshader_icd.json
-    mkdir -p test/test-reports/cpp-vulkan
-    build/bin/vulkan_test --gtest_output=xml:test/test-reports/cpp-vulkan/vulkan_test.xml
+    # NB: the ending test_vulkan must match the current function name for the current
+    # test reporting process (in print_test_stats.py) to function as expected.
+    TEST_REPORTS_DIR=test/test-reports/cpp-vulkan/test_vulkan
+    mkdir -p $TEST_REPORTS_DIR
+    build/bin/vulkan_test --gtest_output=xml:$TEST_REPORTS_DIR/vulkan_test.xml
   fi
 }
 
 test_distributed() {
   if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
     echo "Testing distributed C++ tests"
-    mkdir -p test/test-reports/cpp-distributed
-    build/bin/FileStoreTest --gtest_output=xml:test/test-reports/cpp-distributed/FileStoreTest.xml
-    build/bin/HashStoreTest --gtest_output=xml:test/test-reports/cpp-distributed/HashStoreTest.xml
-    build/bin/TCPStoreTest --gtest_output=xml:test/test-reports/cpp-distributed/TCPStoreTest.xml
+    # NB: the ending test_distributed must match the current function name for the current
+    # test reporting process (in print_test_stats.py) to function as expected.
+    TEST_REPORTS_DIR=test/test-reports/cpp-distributed/test_distributed
+    mkdir -p $TEST_REPORTS_DIR
+    build/bin/FileStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/FileStoreTest.xml
+    build/bin/HashStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/HashStoreTest.xml
+    build/bin/TCPStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/TCPStoreTest.xml
 
-    build/bin/ProcessGroupGlooTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupGlooTest.xml
-    build/bin/ProcessGroupNCCLTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupNCCLTest.xml
-    build/bin/ProcessGroupNCCLErrorsTest --gtest_output=xml:test/test-reports/cpp-distributed/ProcessGroupNCCLErrorsTest.xml
+    build/bin/ProcessGroupGlooTest --gtest_output=xml:$TEST_REPORTS_DIR/ProcessGroupGlooTest.xml
+    build/bin/ProcessGroupNCCLTest --gtest_output=xml:$TEST_REPORTS_DIR/ProcessGroupNCCLTest.xml
+    build/bin/ProcessGroupNCCLErrorsTest --gtest_output=xml:$TEST_REPORTS_DIR/ProcessGroupNCCLErrorsTest.xml
   fi
 }
 
 test_rpc() {
   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
     echo "Testing RPC C++ tests"
-    mkdir -p test/test-reports/cpp-rpc
-    build/bin/test_cpp_rpc --gtest_output=xml:test/test-reports/cpp-rpc/test_cpp_rpc.xml
+    # NB: the ending test_rpc must match the current function name for the current
+    # test reporting process (in print_test_stats.py) to function as expected.
+    TEST_REPORTS_DIR=test/test-reports/cpp-rpc/test_rpc
+    mkdir -p $TEST_REPORTS_DIR
+    build/bin/test_cpp_rpc --gtest_output=xml:$TEST_REPORTS_DIR/test_cpp_rpc.xml
   fi
 }
diff --git a/test/test_testing.py b/test/test_testing.py
index 588979f652c..e1722ba785c 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -649,6 +649,17 @@ def fakehash(char):
     return char * 40
 
 
+def dummy_meta_meta() -> print_test_stats.ReportMetaMeta:
+    return {
+        'build_pr': '',
+        'build_tag': '',
+        'build_sha1': '',
+        'build_branch': '',
+        'build_job': '',
+        'build_workflow_id': '',
+    }
+
+
 def makecase(name, seconds, *, errored=False, failed=False, skipped=False):
     return {
         'name': name,
@@ -659,7 +670,7 @@ def makecase(name, seconds, *, errored=False, failed=False, skipped=False):
     }
 
 
-def makereport(tests):
+def make_report_v1(tests) -> print_test_stats.Version1Report:
     suites = {
         suite_name: {
             'total_seconds': sum(case['seconds'] for case in cases),
@@ -668,59 +679,201 @@
         for suite_name, cases in tests.items()
     }
     return {
+        **dummy_meta_meta(),
         'total_seconds': sum(s['total_seconds'] for s in suites.values()),
         'suites': suites,
     }
 
 
+def make_case_v2(seconds, status=None) -> print_test_stats.Version2Case:
+    return {
+        'seconds': seconds,
+        'status': status,
+    }
+
+
+def make_report_v2(tests) -> print_test_stats.Version2Report:
+    files = {}
+    for file_name, file_suites in tests.items():
+        suites = {
+            suite_name: {
+                'total_seconds': sum(case['seconds'] for case in cases.values()),
+                'cases': cases,
+            }
+            for suite_name, cases in file_suites.items()
+        }
+        files[file_name] = {
+            'suites': suites,
+            'total_seconds': sum(suite['total_seconds'] for suite in suites.values()),
+        }
+    return {
+        **dummy_meta_meta(),
+        'format_version': 2,
+        'total_seconds': sum(s['total_seconds'] for s in files.values()),
+        'files': files,
+    }
+
+
 class TestPrintTestStats(TestCase):
     maxDiff = None
 
-    def test_analysis(self):
-        head_report = makereport({
-            # input ordering of the suites is ignored
-            'Grault': [
-                # not printed: status same and time similar
-                makecase('test_grault0', 4.78, failed=True),
-                # status same, but time increased a lot
-                makecase('test_grault2', 1.473, errored=True),
-            ],
-            # individual tests times changed, not overall suite
-            'Qux': [
-                # input ordering of the test cases is ignored
-                makecase('test_qux1', 0.001, skipped=True),
-                makecase('test_qux6', 0.002, skipped=True),
-                # time in bounds, but status changed
-                makecase('test_qux4', 7.158, failed=True),
-                # not printed because it's the same as before
-                makecase('test_qux7', 0.003, skipped=True),
-                makecase('test_qux5', 11.968),
-                makecase('test_qux3', 23.496),
-            ],
-            # new test suite
-            'Bar': [
-                makecase('test_bar2', 3.742, failed=True),
-                makecase('test_bar1', 50.447),
-            ],
-            # overall suite time changed but no individual tests
-            'Norf': [
-                makecase('test_norf1', 3),
-                makecase('test_norf2', 3),
-                makecase('test_norf3', 3),
-                makecase('test_norf4', 3),
-            ],
-            # suite doesn't show up if it doesn't change enough
-            'Foo': [
-                makecase('test_foo1', 42),
-                makecase('test_foo2', 56),
-            ],
+    version1_report: print_test_stats.Version1Report = make_report_v1({
+        # input ordering of the suites is ignored
+        'Grault': [
+            # not printed: status same and time similar
+            makecase('test_grault0', 4.78, failed=True),
+            # status same, but time increased a lot
+            makecase('test_grault2', 1.473, errored=True),
+        ],
+        # individual tests times changed, not overall suite
+        'Qux': [
+            # input ordering of the test cases is ignored
+            makecase('test_qux1', 0.001, skipped=True),
+            makecase('test_qux6', 0.002, skipped=True),
+            # time in bounds, but status changed
+            makecase('test_qux4', 7.158, failed=True),
+            # not printed because it's the same as before
+            makecase('test_qux7', 0.003, skipped=True),
+            makecase('test_qux5', 11.968),
+            makecase('test_qux3', 23.496),
+        ],
+        # new test suite
+        'Bar': [
+            makecase('test_bar2', 3.742, failed=True),
+            makecase('test_bar1', 50.447),
+        ],
+        # overall suite time changed but no individual tests
+        'Norf': [
+            makecase('test_norf1', 3),
+            makecase('test_norf2', 3),
+            makecase('test_norf3', 3),
+            makecase('test_norf4', 3),
+        ],
+        # suite doesn't show up if it doesn't change enough
+        'Foo': [
+            makecase('test_foo1', 42),
+            makecase('test_foo2', 56),
+        ],
+    })
+
+    version2_report: print_test_stats.Version2Report = make_report_v2(
+        {
+            'test_a': {
+                'Grault': {
+                    'test_grault0': make_case_v2(4.78, 'failed'),
+                    'test_grault2': make_case_v2(1.473, 'errored'),
+                },
+                'Qux': {
+                    'test_qux1': make_case_v2(0.001, 'skipped'),
+                    'test_qux6': make_case_v2(0.002, 'skipped'),
+                    'test_qux4': make_case_v2(7.158, 'failed'),
+                    'test_qux7': make_case_v2(0.003, 'skipped'),
+                    'test_qux8': make_case_v2(11.968),
+                    'test_qux3': make_case_v2(23.496),
+                }
+            },
+            'test_b': {
+                'Bar': {
+                    'test_bar2': make_case_v2(3.742, 'failed'),
+                    'test_bar1': make_case_v2(50.447),
+                },
+                # overall suite time changed but no individual tests
+                'Norf': {
+                    'test_norf1': make_case_v2(3),
+                    'test_norf2': make_case_v2(3),
+                    'test_norf3': make_case_v2(3),
+                    'test_norf4': make_case_v2(3),
+                },
+            },
+            'test_c': {
+                'Foo': {
+                    'test_foo1': make_case_v2(42),
+                    'test_foo2': make_case_v2(56),
+                },
+            }
         })
 
+    def test_simplify(self):
+        self.assertEqual(
+            {
+                '': {
+                    'Bar': {
+                        'test_bar1': {'seconds': 50.447, 'status': None},
+                        'test_bar2': {'seconds': 3.742, 'status': 'failed'},
+                    },
+                    'Foo': {
+                        'test_foo1': {'seconds': 42, 'status': None},
+                        'test_foo2': {'seconds': 56, 'status': None},
+                    },
+                    'Grault': {
+                        'test_grault0': {'seconds': 4.78, 'status': 'failed'},
+                        'test_grault2': {'seconds': 1.473, 'status': 'errored'},
+                    },
+                    'Norf': {
+                        'test_norf1': {'seconds': 3, 'status': None},
+                        'test_norf3': {'seconds': 3, 'status': None},
+                        'test_norf2': {'seconds': 3, 'status': None},
+                        'test_norf4': {'seconds': 3, 'status': None},
+                    },
+                    'Qux': {
+                        'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
+                        'test_qux3': {'seconds': 23.496, 'status': None},
+                        'test_qux4': {'seconds': 7.158, 'status': 'failed'},
+                        'test_qux5': {'seconds': 11.968, 'status': None},
+                        'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
+                        'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
+                    },
+                },
+            },
+            print_test_stats.simplify(self.version1_report)
+        )
+
+        self.assertEqual(
+            {
+                'test_a': {
+                    'Grault': {
+                        'test_grault0': {'seconds': 4.78, 'status': 'failed'},
+                        'test_grault2': {'seconds': 1.473, 'status': 'errored'},
+                    },
+                    'Qux': {
+                        'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
+                        'test_qux3': {'seconds': 23.496, 'status': None},
+                        'test_qux4': {'seconds': 7.158, 'status': 'failed'},
+                        'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
+                        'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
+                        'test_qux8': {'seconds': 11.968, 'status': None},
+                    },
+                },
+                'test_b': {
+                    'Bar': {
+                        'test_bar1': {'seconds': 50.447, 'status': None},
+                        'test_bar2': {'seconds': 3.742, 'status': 'failed'},
+                    },
+                    'Norf': {
+                        'test_norf1': {'seconds': 3, 'status': None},
+                        'test_norf2': {'seconds': 3, 'status': None},
+                        'test_norf3': {'seconds': 3, 'status': None},
+                        'test_norf4': {'seconds': 3, 'status': None},
+                    },
+                },
+                'test_c': {
+                    'Foo': {
+                        'test_foo1': {'seconds': 42, 'status': None},
+                        'test_foo2': {'seconds': 56, 'status': None},
+                    },
+                },
+            },
+            print_test_stats.simplify(self.version2_report),
+        )
+
+    def test_analysis(self):
+        head_report = self.version1_report
+
         base_reports = {
             # bbbb has no reports, so base is cccc instead
             fakehash('b'): [],
             fakehash('c'): [
-                makereport({
+                make_report_v1({
                     'Baz': [
                         makecase('test_baz2', 13.605),
                         # no recent suites have & skip this test
@@ -753,7 +906,7 @@ class TestPrintTestStats(TestCase):
                 }),
             ],
             fakehash('d'): [
-                makereport({
+                make_report_v1({
                     'Foo': [
                         makecase('test_foo1', 40),
                         # removed in cccc
@@ -783,7 +936,7 @@
             ],
             fakehash('e'): [],
             fakehash('f'): [
-                makereport({
+                make_report_v1({
                     'Foo': [
                         makecase('test_foo3', 24),
                         makecase('test_foo1', 43),
@@ -1066,14 +1219,14 @@ Added (across 1 suite) 1 test, totaling + 3.00s
 ''',
             print_test_stats.regression_info(
                 head_sha=fakehash('a'),
-                head_report=makereport({
+                head_report=make_report_v1({
                     'Foo': [
                         makecase('test_foo', 0.02, skipped=True),
                         makecase('test_baz', 3),
                     ]}),
                 base_reports={
                     fakehash('b'): [
-                        makereport({
+                        make_report_v1({
                             'Foo': [
                                 makecase('test_foo', 40),
                                 makecase('test_bar', 1),
@@ -1081,7 +1234,7 @@
                         }),
                     ],
                     fakehash('c'): [
-                        makereport({
+                        make_report_v1({
                            'Foo': [
                                 makecase('test_foo', 43),
                             ],
@@ -1135,7 +1288,7 @@ Added (across 1 suite) 2 tests, totaling + 3.02s
 ''',
             print_test_stats.regression_info(
                 head_sha=fakehash('a'),
-                head_report=makereport({
+                head_report=make_report_v1({
                     'Foo': [
                         makecase('test_foo', 0.02, skipped=True),
                         makecase('test_baz', 3),
diff --git a/test/test_utils.py b/test/test_utils.py
index 49d662d8f4c..78ca0fc8b3e 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -289,7 +289,7 @@ class TestCheckpoint(TestCase):
         out = checkpoint(run_fn2, input_var, input_var2)
         out.sum().backward()
 
-class TestDataLoader(TestCase):
+class TestDataLoaderUtils(TestCase):
     def setUp(self):
         self.dataset = torch.randn(5, 3, 3, 2)
         self.batch_size = 3
diff --git a/tools/test_history.py b/tools/test_history.py
index e941f148bfe..352d9c7b17d 100755
--- a/tools/test_history.py
+++ b/tools/test_history.py
@@ -6,11 +6,11 @@ import json
 import subprocess
 from collections import defaultdict
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
 
 import boto3  # type: ignore[import]
 import botocore  # type: ignore[import]
-from typing_extensions import TypedDict
+from typing_extensions import Literal, TypedDict
 
 
 def get_git_commit_history(
@@ -36,31 +36,70 @@ def get_object_summaries(*, bucket: Any, sha: str) -> Dict[str, List[Any]]:
     return dict(by_job)
 
 
-class Case(TypedDict):
-    name: str
+# TODO: consolidate these typedefs with the identical ones in
+# torch/testing/_internal/print_test_stats.py
+
+Commit = str  # 40-digit SHA-1 hex string
+Status = Optional[Literal['errored', 'failed', 'skipped']]
+
+
+class CaseMeta(TypedDict):
     seconds: float
+
+
+class Version1Case(CaseMeta):
+    name: str
     errored: bool
     failed: bool
     skipped: bool
 
 
-class Suite(TypedDict):
+class Version1Suite(TypedDict):
     total_seconds: float
-    cases: List[Case]
+    cases: List[Version1Case]
 
 
-class ReportMeta(TypedDict):
+class ReportMetaMeta(TypedDict):
     build_pr: str
     build_tag: str
-    build_sha1: str
+    build_sha1: Commit
     build_branch: str
     build_job: str
     build_workflow_id: str
 
 
-class Report(ReportMeta):
+class ReportMeta(ReportMetaMeta):
     total_seconds: float
-    suites: Dict[str, Suite]
+
+
+class Version1Report(ReportMeta):
+    suites: Dict[str, Version1Suite]
+
+
+class Version2Case(CaseMeta):
+    status: Status
+
+
+class Version2Suite(TypedDict):
+    total_seconds: float
+    cases: Dict[str, Version2Case]
+
+
+class Version2File(TypedDict):
+    total_seconds: float
+    suites: Dict[str, Version2Suite]
+
+
+class VersionedReport(ReportMeta):
+    format_version: int
+
+
+# report: Version2Report implies report['format_version'] == 2
+class Version2Report(VersionedReport):
+    files: Dict[str, Version2File]
+
+
+Report = Union[Version1Report, VersionedReport]
 
 
 def get_jsons(
@@ -77,32 +116,63 @@
     }
 
 
+# TODO: consolidate this with the case_status function from
+# torch/testing/_internal/print_test_stats.py
+def case_status(case: Version1Case) -> Status:
+    for k in {'errored', 'failed', 'skipped'}:
+        if case[k]:  # type: ignore[misc]
+            return cast(Status, k)
+    return None
+
+
+# TODO: consolidate this with the newify_case function from
+# torch/testing/_internal/print_test_stats.py
+def newify_case(case: Version1Case) -> Version2Case:
+    return {
+        'seconds': case['seconds'],
+        'status': case_status(case),
+    }
+
+
+# TODO: consolidate this with the simplify function from
+# torch/testing/_internal/print_test_stats.py
 def get_cases(
     *,
     data: Report,
+    filename: Optional[str],
     suite_name: Optional[str],
     test_name: str,
-) -> List[Case]:
-    cases = []
-    suites = data['suites']
-    for name, suite in suites.items():
-        if name == suite_name or not suite_name:
-            for case in suite['cases']:
-                if case['name'] == test_name:
-                    cases.append(case)
+) -> List[Version2Case]:
+    cases: List[Version2Case] = []
+    if 'format_version' not in data:  # version 1 implicitly
+        v1report = cast(Version1Report, data)
+        suites = v1report['suites']
+        for sname, v1suite in suites.items():
+            if sname == suite_name or not suite_name:
+                for v1case in v1suite['cases']:
+                    if v1case['name'] == test_name:
+                        cases.append(newify_case(v1case))
+    else:
+        v_report = cast(VersionedReport, data)
+        version = v_report['format_version']
+        if version == 2:
+            v2report = cast(Version2Report, v_report)
+            for fname, v2file in v2report['files'].items():
+                if fname == filename or not filename:
+                    for sname, v2suite in v2file['suites'].items():
+                        if sname == suite_name or not suite_name:
+                            v2case = v2suite['cases'].get(test_name)
+                            if v2case:
+                                cases.append(v2case)
+        else:
+            raise RuntimeError(f'Unknown format version: {version}')
     return cases
 
 
-def case_status(case: Case) -> Optional[str]:
-    for k in {'errored', 'failed', 'skipped'}:
-        if case[k]:  # type: ignore[misc]
-            return k
-    return None
-
-
 def make_column(
     *,
     data: Optional[Report],
+    filename: Optional[str],
     suite_name: Optional[str],
     test_name: str,
     digits: int,
@@ -112,12 +182,13 @@
     if data:
         cases = get_cases(
             data=data,
+            filename=filename,
             suite_name=suite_name,
             test_name=test_name
         )
         if cases:
            case = cases[0]
-            status = case_status(case)
+            status = case['status']
            omitted = len(cases) - 1
            if status:
                return f'{status.rjust(num_length)} ', omitted
@@ -134,6 +205,7 @@ def make_columns(
     jobs: List[str],
     jsons: Dict[str, Report],
     omitted: Dict[str, int],
+    filename: Optional[str],
     suite_name: Optional[str],
     test_name: str,
     digits: int,
@@ -145,6 +217,7 @@
         data = jsons.get(job)
         column, omitted_suites = make_column(
             data=data,
+            filename=filename,
             suite_name=suite_name,
             test_name=test_name,
             digits=digits,
@@ -165,6 +238,7 @@ def make_lines(
     jobs: Set[str],
     jsons: Dict[str, Report],
     omitted: Dict[str, int],
+    filename: Optional[str],
     suite_name: Optional[str],
     test_name: str,
 ) -> List[str]:
@@ -172,12 +246,13 @@
     for job, data in jsons.items():
         cases = get_cases(
             data=data,
+            filename=filename,
             suite_name=suite_name,
             test_name=test_name,
         )
         if cases:
             case = cases[0]
-            status = case_status(case)
+            status = case['status']
             line = f'{job} {case["seconds"]}s{f" {status}" if status else ""}'
             if job in omitted and omitted[job] > 0:
                 line += f' ({omitted[job]} S3 reports omitted)'
@@ -197,6 +272,7 @@ def display_history(
     bucket: Any,
     commits: List[Tuple[str, datetime]],
     jobs: Optional[List[str]],
+    filename: Optional[str],
     suite_name: Optional[str],
     test_name: str,
     delta: int,
@@ -226,6 +302,7 @@
                 jobs=jobs,
                 jsons=jsons,
                 omitted=omitted,
+                filename=filename,
                 suite_name=suite_name,
                 test_name=test_name,
                 digits=digits,
@@ -236,6 +313,7 @@
             jobs=set(jobs or []),
             jsons=jsons,
             omitted=omitted,
+            filename=filename,
             suite_name=suite_name,
             test_name=test_name,
         )
@@ -352,6 +430,10 @@ indicated test was not found in that report.
         action='store_true',
         help='(multiline) ignore listed jobs, show all jobs for each commit',
     )
+    parser.add_argument(
+        '--file',
+        help='name of the file containing the test',
+    )
     parser.add_argument(
         '--suite',
         help='name of the suite containing the test',
@@ -381,6 +463,7 @@ indicated test was not found in that report.
         bucket=bucket,
         commits=commits,
         jobs=jobs,
+        filename=args.file,
         suite_name=args.suite,
         test_name=args.test,
         delta=args.delta,
diff --git a/torch/testing/_internal/print_test_stats.py b/torch/testing/_internal/print_test_stats.py
index 062c9bf7ffb..ce5d4e806f7 100755
--- a/torch/testing/_internal/print_test_stats.py
+++ b/torch/testing/_internal/print_test_stats.py
@@ -5,6 +5,7 @@ import datetime
 import json
 import math
 import os
+import re
 import statistics
 import subprocess
 import time
@@ -12,11 +13,11 @@ from collections import defaultdict
 from glob import glob
 from pathlib import Path
 from typing import (Any, DefaultDict, Dict, Iterable, Iterator, List, Optional,
-                    Tuple)
+                    Set, Tuple, Union, cast)
 from xml.dom import minidom  # type: ignore[import]
 
 import requests
-from typing_extensions import TypedDict
+from typing_extensions import Literal, TypedDict
 
 try:
     import boto3  # type: ignore[import]
@@ -24,29 +25,30 @@ try:
 except ImportError:
     HAVE_BOTO3 = False
 
+# TODO: consolidate these typedefs with the identical ones in
+# tools/test_history.py
+
 Commit = str  # 40-digit SHA-1 hex string
-Status = Optional[str]  # errored, failed, skipped, or None
-
-# represent suite as dict because indexing is useful
-SimplerCase = Tuple[float, Status]
-SimplerSuite = Dict[str, SimplerCase]
-SimplerReport = Dict[str, SimplerSuite]
+Status = Optional[Literal['errored', 'failed', 'skipped']]
 
 
-class Case(TypedDict):
-    name: str
+class CaseMeta(TypedDict):
     seconds: float
+
+
+class Version1Case(CaseMeta):
+    name: str
     errored: bool
     failed: bool
     skipped: bool
 
 
-class Suite(TypedDict):
+class Version1Suite(TypedDict):
     total_seconds: float
-    cases: List[Case]
+    cases: List[Version1Case]
 
 
-class ReportMeta(TypedDict, total=False):
+class ReportMetaMeta(TypedDict):
     build_pr: str
     build_tag: str
     build_sha1: Commit
@@ -55,9 +57,42 @@ class ReportMeta(TypedDict, total=False):
     build_branch: str
     build_job: str
     build_workflow_id: str
 
 
-class Report(ReportMeta):
+class ReportMeta(ReportMetaMeta):
     total_seconds: float
-    suites: Dict[str, Suite]
+
+
+class Version1Report(ReportMeta):
+    suites: Dict[str, Version1Suite]
+
+
+class Version2Case(CaseMeta):
+    status: Status
+
+
+class Version2Suite(TypedDict):
+    total_seconds: float
+    cases: Dict[str, Version2Case]
+
+
+class Version2File(TypedDict):
+    total_seconds: float
+    suites: Dict[str, Version2Suite]
+
+
+class VersionedReport(ReportMeta):
+    format_version: int
+
+
+# report: Version2Report implies report['format_version'] == 2
+class Version2Report(VersionedReport):
+    files: Dict[str, Version2File]
+
+
+Report = Union[Version1Report, VersionedReport]
+
+SimplerSuite = Dict[str, Version2Case]
+SimplerFile = Dict[str, SimplerSuite]
+SimplerReport = Dict[str, SimplerFile]
 
 
 class Stat(TypedDict):
@@ -69,7 +104,7 @@ class CaseDiff(TypedDict):
     margin: str
     name: str
     was: Optional[Tuple[Stat, Status]]
-    now: Optional[SimplerCase]
+    now: Optional[Version2Case]
 
 
 class SuiteDiff(TypedDict):
@@ -80,23 +115,78 @@
     cases: List[CaseDiff]
 
 
-def case_status(case: Case) -> Status:
+# TODO: consolidate this with the case_status function from
+# tools/test_history.py
+def case_status(case: Version1Case) -> Status:
     for k in {'errored', 'failed', 'skipped'}:
         if case[k]:  # type: ignore[misc]
-            return k
+            return cast(Status, k)
     return None
 
 
-def simplify(report: Report) -> SimplerReport:
+# TODO: consolidate this with the newify_case function from
+# tools/test_history.py
+def newify_case(case: Version1Case) -> Version2Case:
     return {
-        suite_name: {
-            case['name']: (case['seconds'], case_status(case))
-            for case in suite['cases']
-        }
-        for suite_name, suite in report['suites'].items()
+        'seconds': case['seconds'],
+        'status': case_status(case),
     }
 
 
+# TODO: consolidate this with the get_cases function from
+# tools/test_history.py
+
+# Here we translate to a three-layer format (file -> suite -> case)
+# rather than a two-layer format (suite -> case) because as mentioned in
+# a comment in the body of this function, if we consolidate suites that
+# share a name, there will be test case name collisions, and once we
+# have those, there's no clean way to deal with it in the diffing logic.
+# It's not great to have to add a dummy empty string for the filename
+# for version 1 reports, but it's better than either losing cases that
+# share a name (for version 2 reports) or using a list of cases rather
+# than a dict.
+def simplify(report: Report) -> SimplerReport:
+    if 'format_version' not in report:  # version 1 implicitly
+        v1report = cast(Version1Report, report)
+        return {
+            # we just don't have test filename information sadly, so we
+            # just make one fake filename that is the empty string
+            '': {
+                suite_name: {
+                    # This clobbers some cases that have duplicate names
+                    # because in version 1, we would merge together all
+                    # the suites with a given name (even if they came
+                    # from different files), so there were actually
+                    # situations in which two cases in the same suite
+                    # shared a name (because they actually originally
+                    # came from two suites that were then merged). It
+                    # would probably be better to warn about the cases
+                    # that we're silently discarding here, but since
+                    # we're only uploading in the new format (where
+                    # everything is also keyed by filename) going
+                    # forward, it shouldn't matter too much.
+                    case['name']: newify_case(case)
+                    for case in suite['cases']
+                }
+                for suite_name, suite in v1report['suites'].items()
+            }
+        }
+    else:
+        v_report = cast(VersionedReport, report)
+        version = v_report['format_version']
+        if version == 2:
+            v2report = cast(Version2Report, v_report)
+            return {
+                filename: {
+                    suite_name: suite['cases']
+                    for suite_name, suite in file_data['suites'].items()
+                }
+                for filename, file_data in v2report['files'].items()
+            }
+        else:
+            raise RuntimeError(f'Unknown format version: {version}')
+
+
 def plural(n: int) -> str:
     return '' if n == 1 else 's'
@@ -165,7 +255,9 @@ def unlines(lines: List[str]) -> str:
 
 
 def matching_test_times(
+    *,
     base_reports: Dict[Commit, List[SimplerReport]],
+    filename: str,
     suite_name: str,
     case_name: str,
     status: Status,
@@ -173,13 +265,16 @@
     times: List[float] = []
     for reports in base_reports.values():
         for report in reports:
-            suite = report.get(suite_name)
-            if suite:
-                case = suite.get(case_name)
-                if case:
-                    t, s = case
-                    if s == status:
-                        times.append(t)
+            file_data = report.get(filename)
+            if file_data:
+                suite = file_data.get(suite_name)
+                if suite:
+                    case = suite.get(case_name)
+                    if case:
+                        t = case['seconds']
+                        s = case['status']
+                        if s == status:
+                            times.append(t)
     return times
@@ -195,30 +290,43 @@ def analyze(
     # find all relevant suites (those in either base or head or both)
     all_reports = [head_report] + base_report
-    all_suites = {k for r in all_reports for k in r.keys()}
+    all_suites: Set[Tuple[str, str]] = {
+        (filename, suite_name)
+        for r in all_reports
+        for filename, file_data in r.items()
+        for suite_name in file_data.keys()
+    }
 
     removed_suites: List[SuiteDiff] = []
     modified_suites: List[SuiteDiff] = []
     added_suites: List[SuiteDiff] = []
 
-    for suite_name in sorted(all_suites):
+    for filename, suite_name in sorted(all_suites):
         case_diffs: List[CaseDiff] = []
-        head_suite = head_report.get(suite_name)
+        head_suite = head_report.get(filename, {}).get(suite_name)
         base_cases: Dict[str, Status] = dict(sorted(set.intersection(*[
-            {(n, s) for n, (_, s) in report.get(suite_name, {}).items()}
+            {
+                (n, case['status'])
+                for n, case
+                in report.get(filename, {}).get(suite_name, {}).items()
+            }
             for report in base_report
         ] or [set()])))
 
         case_stats: Dict[str, Stat] = {}
         if head_suite:
-            now = sum(case[0] for case in head_suite.values())
-            if any(suite_name in report for report in base_report):
+            now = sum(case['seconds'] for case in head_suite.values())
+            if any(
+                filename in report and suite_name in report[filename]
+                for report in base_report
+            ):
                 removed_cases: List[CaseDiff] = []
                 for case_name, case_status in base_cases.items():
                     case_stats[case_name] = list_stat(matching_test_times(
-                        base_reports,
-                        suite_name,
-                        case_name,
-                        case_status,
+                        base_reports=base_reports,
+                        filename=filename,
+                        suite_name=suite_name,
+                        case_name=case_name,
+                        status=case_status,
                     ))
                     if case_name not in head_suite:
                         removed_cases.append({
@@ -234,7 +342,7 @@
                 if head_case_name in base_cases:
                     stat = case_stats[head_case_name]
                     base_status = base_cases[head_case_name]
-                    if head_case[1] != base_status:
+                    if head_case['status'] != base_status:
                         modified_cases.append({
                             'margin': '!',
                             'name': head_case_name,
@@ -278,10 +386,11 @@
         else:
             for case_name, case_status in base_cases.items():
                 case_stats[case_name] = list_stat(matching_test_times(
-                    base_reports,
-                    suite_name,
-                    case_name,
-                    case_status,
+                    base_reports=base_reports,
+                    filename=filename,
+                    suite_name=suite_name,
+                    case_name=case_name,
+                    status=case_status,
                 ))
                 case_diffs.append({
                     'margin': ' ',
@@ -316,9 +425,9 @@ def case_diff_lines(diff: CaseDiff) -> List[str]:
     now = diff['now']
     if now:
-        now_stat: Stat = {'center': now[0], 'spread': None}
+        now_stat: Stat = {'center': now['seconds'], 'spread': None}
         now_line = f'  # now {display_stat(now_stat, case_fmt)}'
-        now_status = now[1]
+        now_status = now['status']
         if now_status:
             now_line += f' ({now_status})'
         lines.append(now_line)
@@ -410,7 +519,7 @@ def case_delta(case: CaseDiff) -> Stat:
     now = case['now']
     return recenter(
         was[0] if was else zero_stat(),
-        now[0] if now else 0,
+        now['seconds'] if now else 0,
     )
@@ -542,7 +651,7 @@ class TestCase:
 class TestSuite:
     def __init__(self, name: str) -> None:
         self.name = name
-        self.test_cases: List[TestCase] = []
+        self.test_cases: Dict[str, TestCase] = dict()
         self.failed_count = 0
         self.skipped_count = 0
         self.errored_count = 0
@@ -555,14 +664,14 @@
         return f'TestSuite({rc})'
 
     def append(self, test_case: TestCase) -> None:
-        self.test_cases.append(test_case)
+        self.test_cases[test_case.name] = test_case
         self.total_time += test_case.time
         self.failed_count += 1 if test_case.failed else 0
         self.skipped_count += 1 if test_case.skipped else 0
         self.errored_count += 1 if test_case.errored else 0
 
     def print_report(self, num_longest: int = 3) -> None:
-        sorted_tests = sorted(self.test_cases, key=lambda x: x.time)
+        sorted_tests = sorted(self.test_cases.values(), key=lambda x: x.time)
         test_count = len(sorted_tests)
         print(f"class {self.name}:")
         print(f"    tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} errored: {self.errored_count}")
@@ -577,25 +686,48 @@
         print("")
 
 
+class TestFile:
+    def __init__(self, name: str) -> None:
+        self.name = name
+        self.total_time = 0.0
+        self.test_suites: Dict[str, TestSuite] = dict()
+
+    def append(self, test_case: TestCase) -> None:
+        suite_name = test_case.class_name
+        if suite_name not in self.test_suites:
+            self.test_suites[suite_name] = TestSuite(suite_name)
+        if test_case.name in self.test_suites[suite_name].test_cases:
+            # This behaviour is expected for test_cpp_extensions_aot, distributed/test_distributed_fork,
+            # and distributed/test_distributed_spawn. In these cases, we just lump the duplicate tests
+            # together, which is admittedly inaccurate for test_cpp_extensions_aot, though negligible
+            # there since the test is short. For other unexpected cases, we should raise a warning.
+            if self.name != 'test_cpp_extensions_aot' and \
+               self.name != 'distributed/test_distributed_fork' and \
+               self.name != 'distributed/test_distributed_spawn' and \
+               self.name != 'cpp':  # also allow this cpp one, as it runs twice in caffe2 ort jobs
+                raise RuntimeWarning(f'Duplicate test case {test_case.name} in suite {suite_name} called from {self.name}')
+        self.test_suites[suite_name].append(test_case)
+        self.total_time += test_case.time
+
+
 def parse_report(path: str) -> Iterator[TestCase]:
     dom = minidom.parse(path)
     for test_case in dom.getElementsByTagName('testcase'):
         yield TestCase(test_case)
 
 
-def parse_reports(folder: str) -> Dict[str, TestSuite]:
+def parse_reports(folder: str) -> Dict[str, TestFile]:
     reports = glob(os.path.join(folder, '**', '*.xml'), recursive=True)
-    tests_by_class = dict()
+    tests_by_file = dict()
     for report in reports:
+        test_filename = re.sub(r'\.', '/', os.path.basename(os.path.dirname(report)))
+        if test_filename not in tests_by_file:
+            tests_by_file[test_filename] = TestFile(test_filename)
         for test_case in parse_report(report):
-            class_name = test_case.class_name
-            if class_name not in tests_by_class:
-                tests_by_class[class_name] = TestSuite(class_name)
-            tests_by_class[class_name].append(test_case)
-    return tests_by_class
+            tests_by_file[test_filename].append(test_case)
+    return tests_by_file
 
-
-def build_info() -> ReportMeta:
+def build_info() -> ReportMetaMeta:
     return {
         "build_pr": os.environ.get("CIRCLE_PR_NUMBER", ""),
         "build_tag": os.environ.get("CIRCLE_TAG", ""),
@@ -624,7 +756,7 @@ def build_message(test_case: TestCase) -> Dict[str, Dict[str, Any]]:
     }
 
 
-def send_report_to_scribe(reports: Dict[str, TestSuite]) -> None:
+def send_report_to_scribe(reports: Dict[str, TestFile]) -> None:
     access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN")
 
     if not access_token:
@@ -643,8 +775,9 @@
                         "message": json.dumps(build_message(test_case)),
                         "line_escape": False,
                     }
-                    for name in sorted(reports.keys())
-                    for test_case in reports[name].test_cases
+                    for test_file in reports.values()
+                    for test_suite in test_file.test_suites.values()
+                    for test_case in test_suite.test_cases.values()
                 ]
             ),
         },
@@ -653,33 +786,40 @@
 
 
 def assemble_s3_object(
-    reports: Dict[str, TestSuite],
+    reports: Dict[str, TestFile],
     *,
     total_seconds: float,
-) -> Report:
+) -> Version2Report:
     return {
         **build_info(),  # type: ignore[misc]
         'total_seconds': total_seconds,
-        'suites': {
+        'format_version': 2,
+        'files' : {
             name: {
-                'total_seconds': suite.total_time,
-                'cases': [
-                    {
-                        'name': case.name,
-                        'seconds': case.time,
-                        'errored': case.errored,
-                        'failed': case.failed,
-                        'skipped': case.skipped,
+                'total_seconds': test_file.total_time,
+                'filename': test_file.name,
+                'suites': {
+                    name: {
+                        'total_seconds': suite.total_time,
+                        'cases': {
+                            name: {
+                                'seconds': case.time,
+                                'status': 'skipped' if case.skipped else
+                                          'errored' if case.errored else
+                                          'failed' if case.failed else None
+                            }
+                            for name, case in suite.test_cases.items()
+                        },
                     }
-                    for case in suite.test_cases
-                ],
+                    for name, suite in test_file.test_suites.items()
+                }
             }
-            for name, suite in reports.items()
+            for name, test_file in reports.items()
         }
     }
 
 
-def send_report_to_s3(head_report: Report) -> None:
+def send_report_to_s3(head_report: Version2Report) -> None:
     job = os.environ.get('CIRCLE_JOB')
     sha1 = os.environ.get('CIRCLE_SHA1')
     branch = os.environ.get('CIRCLE_BRANCH', '')
@@ -773,6 +913,13 @@ def positive_float(value: str) -> float:
     return parsed
 
 
+def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool:
+    for test_file in reports.values():
+        for test_suite in test_file.test_suites.values():
+            if len(test_suite.test_cases) > 0:
+                return False
+    return True
+
 if __name__ == '__main__':
     import argparse
     import sys
@@ -830,24 +977,25 @@
     )
     args = parser.parse_args()
 
-    reports = parse_reports(args.folder)
-    if len(reports) == 0:
-        print(f"No test reports found in {args.folder}")
+    reports_by_file = parse_reports(args.folder)
+    if reports_has_no_tests(reports_by_file):
+        print(f"No tests found in reports in {args.folder}")
         sys.exit(0)
 
-    send_report_to_scribe(reports)
+    send_report_to_scribe(reports_by_file)
 
-    longest_tests = []
+    # longest_tests can contain duplicates, as the same test can be spawned from different files
+    longest_tests: List[TestCase] = []
     total_time = 0.0
-    for name in sorted(reports.keys()):
-        test_suite = reports[name]
-        if test_suite.total_time >= args.class_print_threshold:
-            test_suite.print_report(args.longest_of_class)
-            total_time += test_suite.total_time
-            longest_tests.extend(test_suite.test_cases)
+    for filename, test_filename in reports_by_file.items():
+        for suite_name, test_suite in test_filename.test_suites.items():
+            if test_suite.total_time >= args.class_print_threshold:
+                test_suite.print_report(args.longest_of_class)
+                total_time += test_suite.total_time
+                longest_tests.extend(test_suite.test_cases.values())
    longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:]
 
-    obj = assemble_s3_object(reports, total_seconds=total_time)
+    obj = assemble_s3_object(reports_by_file, total_seconds=total_time)
     if args.upload_to_s3:
         send_report_to_s3(obj)
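
Note on the version 2 schema: a report now nests file -> suite -> case, and the three
per-case booleans of version 1 are collapsed into the single Status field. A minimal
sketch of the conversion performed by the case_status/newify_case pair defined above
(the sample case dict is illustrative, not taken from a real report):

    # a version 1 case records each outcome as a separate boolean on the case
    v1_case = {'name': 'test_foo', 'seconds': 1.5,
               'errored': False, 'failed': True, 'skipped': False}

    # newify_case drops the name (version 2 keys cases by name in a dict)
    # and collapses the booleans into one status value
    assert newify_case(v1_case) == {'seconds': 1.5, 'status': 'failed'}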
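Relatedly, the NB comments added to test.sh exist because parse_reports derives each
report's test "filename" from the directory that directly contains the XML file,
converting dots to slashes, so a report written under .../cpp-rpc/test_rpc/ is
attributed to test_rpc. A small sketch of that derivation (the example paths here are
hypothetical):

    import os
    import re

    paths = [
        'test/test-reports/cpp-rpc/test_rpc/test_cpp_rpc.xml',
        'test/test-reports/python-unittest/distributed.test_store/report.xml',
    ]
    for path in paths:
        # same expression as in parse_reports above
        test_filename = re.sub(r'\.', '/', os.path.basename(os.path.dirname(path)))
        print(test_filename)  # -> test_rpc, then distributed/test_store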