First step toward refactoring S3 reading logic (#53755)

Summary:
This is an initial attempt at refactoring and consolidating our S3 read logic for print_test_stats.py, test_history.py, and run_test.py. This way, boto3 and botocore no longer need to be imported in various places throughout the code base, and duplicated logic (such as the many type definitions) can live in one place: `tools/stats_utils/s3_stat_parser.py`. walterddr contributed to this PR by moving print_test_stats.py to the tools folder and the corresponding tests to a subfolder within tools.
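For illustration, this is the shape of the consolidated pattern (a minimal sketch based on the diff below, not the full module): the boto3/botocore imports live in a single guarded block inside the parser module, and callers only touch the shared helpers and the `HAVE_BOTO3` flag.

```python
# Sketch of the pattern now used in tools/stats_utils/s3_stat_parser.py:
# boto3/botocore are imported exactly once, guarded by HAVE_BOTO3.
from typing import Any

try:
    import boto3  # type: ignore[import]
    import botocore  # type: ignore[import]
    HAVE_BOTO3 = True
except ImportError:
    HAVE_BOTO3 = False


def get_S3_bucket_readonly(bucket_name: str) -> Any:
    # unsigned requests suffice because the stats bucket is world-readable
    s3 = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED))
    return s3.Bucket(bucket_name)


# Callers (print_test_stats.py, test_history.py, run_test.py) then just do:
#     from tools.stats_utils.s3_stat_parser import get_S3_bucket_readonly, HAVE_BOTO3
```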

**NOTE: this removes those tests from CI, as the new `tools/test/test_stats.py` does not live in the test/ directory like the other tests listed in TESTS in run_test.py.**

Pull Request resolved: https://github.com/pytorch/pytorch/pull/53755

Test Plan:
This is a refactoring change and should not break anything: the scripts should run exactly as they did before.
To verify that print_test_stats.py still functions, run `python tools/test/test_stats.py` and make sure all tests pass.
To verify that test_history.py works, run the example commands from `tools/test_history.py --help` and check that their output matches what is shown there. Note that the script keeps printing for a while, so don't be alarmed.

Some next steps:
- Identifying actual similarities among the three current use cases and further refactoring/consolidating functions (e.g., combining simplify and get_cases; see the sketch after this list)
- Moving more parsing logic into s3_stat_parser.py for better abstraction between our files
- Adding tests for s3_stat_parser.py once it contains more functionality
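To make the first bullet concrete, here is a hypothetical shape such a consolidation could take (not part of this PR): both `simplify` and `get_cases` walk the same version-1/version-2 report structures, so a single traversal helper in s3_stat_parser.py could feed both.

```python
# Hypothetical helper (an illustration only): yield every case in a report
# as (filename, suite_name, case_name, Version2Case), normalizing v1 cases
# via newify_case. simplify() and get_cases() could then become thin
# filters over this one walk.
from typing import Iterator, Tuple, cast

from tools.stats_utils.s3_stat_parser import (Report, Version1Report, Version2Report,
                                              Version2Case, newify_case)


def walk_cases(data: Report) -> Iterator[Tuple[str, str, str, Version2Case]]:
    if 'format_version' not in data:  # version 1 implicitly
        v1report = cast(Version1Report, data)
        for sname, v1suite in v1report['suites'].items():
            for v1case in v1suite['cases']:
                yield '', sname, v1case['name'], newify_case(v1case)
    else:  # assumes format_version == 2 for brevity, as get_cases does after its check
        v2report = cast(Version2Report, data)
        for fname, v2file in v2report['files'].items():
            for sname, v2suite in v2file['suites'].items():
                for cname, v2case in v2suite['cases'].items():
                    yield fname, sname, cname, v2case
```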

Reviewed By: agolynski, samestep

Differential Revision: D27030285

Pulled By: janeyx99

fbshipit-source-id: e664781324ef7c0c30943bfd7f17c895075ef7a7
Jane Xu authored on 2021-03-17 12:30:21 -07:00, committed by Facebook GitHub Bot
parent ccdcfba5de
commit 2e7311ef25
11 changed files with 808 additions and 886 deletions


@@ -636,7 +636,8 @@ jobs:
export CIRCLE_JOB="$CIRCLE_JOB"
export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID"
cd workspace
python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test
export PYTHONPATH="\${PWD}"
python tools/print_test_stats.py --upload-to-s3 --compare-with-s3 test
EOL
echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh
unbuffer bash command.sh | ts
@@ -800,8 +801,9 @@ jobs:
export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID"
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_WIN_BUILD_V1}
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_WIN_BUILD_V1}
export PYTHONPATH="$PWD"
pip install typing_extensions boto3
python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test
python tools/print_test_stats.py --upload-to-s3 --compare-with-s3 test
when: always
- store_test_results:
path: test/test-reports


@@ -198,7 +198,8 @@ jobs:
export CIRCLE_JOB="$CIRCLE_JOB"
export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID"
cd workspace
python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test
export PYTHONPATH="\${PWD}"
python tools/print_test_stats.py --upload-to-s3 --compare-with-s3 test
EOL
echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh
unbuffer bash command.sh | ts
@@ -362,8 +363,9 @@ jobs:
export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID"
export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_WIN_BUILD_V1}
export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_WIN_BUILD_V1}
export PYTHONPATH="$PWD"
pip install typing_extensions boto3
python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test
python tools/print_test_stats.py --upload-to-s3 --compare-with-s3 test
when: always
- store_test_results:
path: test/test-reports


@@ -34,13 +34,15 @@ warn_return_any = True
implicit_reexport = False
strict_equality = True
files = tools/codegen/gen.py,
files =
tools/autograd/*.py,
tools/codegen/gen.py,
tools/print_test_stats.py,
tools/pyi/*.py,
tools/stats_utils/*.py,
tools/test_history.py,
torch/testing/_internal/framework_utils.py,
torch/testing/_internal/mypy_wrapper.py,
torch/testing/_internal/print_test_stats.py,
torch/utils/benchmark/utils/common.py,
torch/utils/benchmark/utils/timer.py,
torch/utils/benchmark/utils/valgrind_wrapper/*.py,


@@ -34,8 +34,9 @@ files =
test/test_type_hints.py,
test/test_type_info.py,
test/test_utils.py,
tools/clang_format_utils.py,
tools/generate_torch_version.py,
tools/clang_format_utils.py
tools/stats_utils/*.py
# Minimum version supported - variable annotations were introduced


@@ -22,11 +22,10 @@ from typing import Dict, Optional, Tuple, List, Any
from typing_extensions import TypedDict
try:
import boto3 # type: ignore[import]
import botocore # type: ignore[import]
import botocore.exceptions # type: ignore[import]
HAVE_BOTO3 = True
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
from tools.stats_utils.s3_stat_parser import (get_S3_bucket_readonly, HAVE_BOTO3)
except ImportError:
print("Unable to import s3_stat_parser from tools. Running without S3 stats...")
HAVE_BOTO3 = False
@@ -378,25 +377,19 @@ def get_test_time_reports_from_S3() -> List[Dict[str, Any]]:
job = os.environ.get("CIRCLE_JOB", "")
job_minus_shard_number = job.rstrip('0123456789')
try:
s3 = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED))
bucket = s3.Bucket(name="ossci-metrics")
reports = []
commit_index = 0
while len(reports) == 0 and commit_index < len(nightly_commits):
nightly_commit = nightly_commits[commit_index]
print(f'Grabbing reports from nightly commit: {nightly_commit}')
summaries = bucket.objects.filter(Prefix=f"test_time/{nightly_commit}/{job_minus_shard_number}")
for summary in summaries:
binary = summary.get()["Body"].read()
string = bz2.decompress(binary).decode("utf-8")
reports.append(json.loads(string))
commit_index += 1
return reports
except botocore.exceptions.ClientError as err:
print('Error Message: {}'.format(err.response['Error']['Message']))
return []
bucket = get_S3_bucket_readonly('ossci-metrics')
reports = []
commit_index = 0
while len(reports) == 0 and commit_index < len(nightly_commits):
nightly_commit = nightly_commits[commit_index]
print(f'Grabbing reports from nightly commit: {nightly_commit}')
summaries = bucket.objects.filter(Prefix=f"test_time/{nightly_commit}/{job_minus_shard_number}")
for summary in summaries:
binary = summary.get()["Body"].read()
string = bz2.decompress(binary).decode("utf-8")
reports.append(json.loads(string))
commit_index += 1
return reports
def calculate_job_times(reports: List[Dict[str, Any]]) -> Dict[str, Tuple[float, int]]:
@@ -431,7 +424,8 @@ def pull_job_times_from_S3() -> Dict[str, Tuple[float, int]]:
if HAVE_BOTO3:
s3_reports = get_test_time_reports_from_S3()
else:
print('Please install boto3 to enable using S3 test times for automatic sharding and test categorization.')
print('Uh oh, boto3 is not found. Either it is not installed or we failed to import s3_stat_parser.')
print('If not installed, please install boto3 for automatic sharding and test categorization.')
s3_reports = []
if len(s3_reports) == 0:


@@ -10,7 +10,6 @@ from torch.testing._internal.framework_utils import calculate_shards
from torch.testing._internal.common_device_type import \
(instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes)
from torch.testing._internal import mypy_wrapper
from torch.testing._internal import print_test_stats
# For testing TestCase methods and torch.testing functions
class TestTesting(TestCase):
@@ -647,639 +646,6 @@ class TestMypyWrapper(TestCase):
))
def fakehash(char):
return char * 40
def dummy_meta_meta() -> print_test_stats.ReportMetaMeta:
return {
'build_pr': '',
'build_tag': '',
'build_sha1': '',
'build_branch': '',
'build_job': '',
'build_workflow_id': '',
}
def makecase(name, seconds, *, errored=False, failed=False, skipped=False):
return {
'name': name,
'seconds': seconds,
'errored': errored,
'failed': failed,
'skipped': skipped,
}
def make_report_v1(tests) -> print_test_stats.Version1Report:
suites = {
suite_name: {
'total_seconds': sum(case['seconds'] for case in cases),
'cases': cases,
}
for suite_name, cases in tests.items()
}
return {
**dummy_meta_meta(),
'total_seconds': sum(s['total_seconds'] for s in suites.values()),
'suites': suites,
}
def make_case_v2(seconds, status=None) -> print_test_stats.Version2Case:
return {
'seconds': seconds,
'status': status,
}
def make_report_v2(tests) -> print_test_stats.Version2Report:
files = {}
for file_name, file_suites in tests.items():
suites = {
suite_name: {
'total_seconds': sum(case['seconds'] for case in cases.values()),
'cases': cases,
}
for suite_name, cases in file_suites.items()
}
files[file_name] = {
'suites': suites,
'total_seconds': sum(suite['total_seconds'] for suite in suites.values()),
}
return {
**dummy_meta_meta(),
'format_version': 2,
'total_seconds': sum(s['total_seconds'] for s in files.values()),
'files': files,
}
class TestPrintTestStats(TestCase):
maxDiff = None
version1_report: print_test_stats.Version1Report = make_report_v1({
# input ordering of the suites is ignored
'Grault': [
# not printed: status same and time similar
makecase('test_grault0', 4.78, failed=True),
# status same, but time increased a lot
makecase('test_grault2', 1.473, errored=True),
],
# individual tests times changed, not overall suite
'Qux': [
# input ordering of the test cases is ignored
makecase('test_qux1', 0.001, skipped=True),
makecase('test_qux6', 0.002, skipped=True),
# time in bounds, but status changed
makecase('test_qux4', 7.158, failed=True),
# not printed because it's the same as before
makecase('test_qux7', 0.003, skipped=True),
makecase('test_qux5', 11.968),
makecase('test_qux3', 23.496),
],
# new test suite
'Bar': [
makecase('test_bar2', 3.742, failed=True),
makecase('test_bar1', 50.447),
],
# overall suite time changed but no individual tests
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
# suite doesn't show up if it doesn't change enough
'Foo': [
makecase('test_foo1', 42),
makecase('test_foo2', 56),
],
})
version2_report: print_test_stats.Version2Report = make_report_v2(
{
'test_a': {
'Grault': {
'test_grault0': make_case_v2(4.78, 'failed'),
'test_grault2': make_case_v2(1.473, 'errored'),
},
'Qux': {
'test_qux1': make_case_v2(0.001, 'skipped'),
'test_qux6': make_case_v2(0.002, 'skipped'),
'test_qux4': make_case_v2(7.158, 'failed'),
'test_qux7': make_case_v2(0.003, 'skipped'),
'test_qux8': make_case_v2(11.968),
'test_qux3': make_case_v2(23.496),
}
},
'test_b': {
'Bar': {
'test_bar2': make_case_v2(3.742, 'failed'),
'test_bar1': make_case_v2(50.447),
},
# overall suite time changed but no individual tests
'Norf': {
'test_norf1': make_case_v2(3),
'test_norf2': make_case_v2(3),
'test_norf3': make_case_v2(3),
'test_norf4': make_case_v2(3),
},
},
'test_c': {
'Foo': {
'test_foo1': make_case_v2(42),
'test_foo2': make_case_v2(56),
},
}
})
def test_simplify(self):
self.assertEqual(
{
'': {
'Bar': {
'test_bar1': {'seconds': 50.447, 'status': None},
'test_bar2': {'seconds': 3.742, 'status': 'failed'},
},
'Foo': {
'test_foo1': {'seconds': 42, 'status': None},
'test_foo2': {'seconds': 56, 'status': None},
},
'Grault': {
'test_grault0': {'seconds': 4.78, 'status': 'failed'},
'test_grault2': {'seconds': 1.473, 'status': 'errored'},
},
'Norf': {
'test_norf1': {'seconds': 3, 'status': None},
'test_norf3': {'seconds': 3, 'status': None},
'test_norf2': {'seconds': 3, 'status': None},
'test_norf4': {'seconds': 3, 'status': None},
},
'Qux': {
'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
'test_qux3': {'seconds': 23.496, 'status': None},
'test_qux4': {'seconds': 7.158, 'status': 'failed'},
'test_qux5': {'seconds': 11.968, 'status': None},
'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
},
},
},
print_test_stats.simplify(self.version1_report)
)
self.assertEqual(
{
'test_a': {
'Grault': {
'test_grault0': {'seconds': 4.78, 'status': 'failed'},
'test_grault2': {'seconds': 1.473, 'status': 'errored'},
},
'Qux': {
'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
'test_qux3': {'seconds': 23.496, 'status': None},
'test_qux4': {'seconds': 7.158, 'status': 'failed'},
'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
'test_qux8': {'seconds': 11.968, 'status': None},
},
},
'test_b': {
'Bar': {
'test_bar1': {'seconds': 50.447, 'status': None},
'test_bar2': {'seconds': 3.742, 'status': 'failed'},
},
'Norf': {
'test_norf1': {'seconds': 3, 'status': None},
'test_norf2': {'seconds': 3, 'status': None},
'test_norf3': {'seconds': 3, 'status': None},
'test_norf4': {'seconds': 3, 'status': None},
},
},
'test_c': {
'Foo': {
'test_foo1': {'seconds': 42, 'status': None},
'test_foo2': {'seconds': 56, 'status': None},
},
},
},
print_test_stats.simplify(self.version2_report),
)
def test_analysis(self):
head_report = self.version1_report
base_reports = {
# bbbb has no reports, so base is cccc instead
fakehash('b'): [],
fakehash('c'): [
make_report_v1({
'Baz': [
makecase('test_baz2', 13.605),
# no recent suites have & skip this test
makecase('test_baz1', 0.004, skipped=True),
],
'Foo': [
makecase('test_foo1', 43),
# test added since dddd
makecase('test_foo2', 57),
],
'Grault': [
makecase('test_grault0', 4.88, failed=True),
makecase('test_grault1', 11.967, failed=True),
makecase('test_grault2', 0.395, errored=True),
makecase('test_grault3', 30.460),
],
'Norf': [
makecase('test_norf1', 2),
makecase('test_norf2', 2),
makecase('test_norf3', 2),
makecase('test_norf4', 2),
],
'Qux': [
makecase('test_qux3', 4.978, errored=True),
makecase('test_qux7', 0.002, skipped=True),
makecase('test_qux2', 5.618),
makecase('test_qux4', 7.766, errored=True),
makecase('test_qux6', 23.589, failed=True),
],
}),
],
fakehash('d'): [
make_report_v1({
'Foo': [
makecase('test_foo1', 40),
# removed in cccc
makecase('test_foo3', 17),
],
'Baz': [
# not skipped, so not included in stdev
makecase('test_baz1', 3.14),
],
'Qux': [
makecase('test_qux7', 0.004, skipped=True),
makecase('test_qux2', 6.02),
makecase('test_qux4', 20.932),
],
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
'Grault': [
makecase('test_grault0', 5, failed=True),
makecase('test_grault1', 14.325, failed=True),
makecase('test_grault2', 0.31, errored=True),
],
}),
],
fakehash('e'): [],
fakehash('f'): [
make_report_v1({
'Foo': [
makecase('test_foo3', 24),
makecase('test_foo1', 43),
],
'Baz': [
makecase('test_baz2', 16.857),
],
'Qux': [
makecase('test_qux2', 6.422),
makecase('test_qux4', 6.382, errored=True),
],
'Norf': [
makecase('test_norf1', 0.9),
makecase('test_norf3', 0.9),
makecase('test_norf2', 0.9),
makecase('test_norf4', 0.9),
],
'Grault': [
makecase('test_grault0', 4.7, failed=True),
makecase('test_grault1', 13.146, failed=True),
makecase('test_grault2', 0.48, errored=True),
],
}),
],
}
simpler_head = print_test_stats.simplify(head_report)
simpler_base = {}
for commit, reports in base_reports.items():
simpler_base[commit] = [print_test_stats.simplify(r) for r in reports]
analysis = print_test_stats.analyze(
head_report=simpler_head,
base_reports=simpler_base,
)
self.assertEqual(
'''\
- class Baz:
- # was 15.23s ± 2.30s
-
- def test_baz1: ...
- # was 0.004s (skipped)
-
- def test_baz2: ...
- # was 15.231s ± 2.300s
class Grault:
# was 48.86s ± 1.19s
# now 6.25s
- def test_grault1: ...
- # was 13.146s ± 1.179s (failed)
- def test_grault3: ...
- # was 30.460s
class Qux:
# was 41.66s ± 1.06s
# now 42.63s
- def test_qux2: ...
- # was 6.020s ± 0.402s
! def test_qux3: ...
! # was 4.978s (errored)
! # now 23.496s
! def test_qux4: ...
! # was 7.074s ± 0.979s (errored)
! # now 7.158s (failed)
! def test_qux6: ...
! # was 23.589s (failed)
! # now 0.002s (skipped)
+ def test_qux1: ...
+ # now 0.001s (skipped)
+ def test_qux5: ...
+ # now 11.968s
+ class Bar:
+ # now 54.19s
+
+ def test_bar1: ...
+ # now 50.447s
+
+ def test_bar2: ...
+ # now 3.742s (failed)
''',
print_test_stats.anomalies(analysis),
)
def test_graph(self):
# HEAD is on master
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
* aaaaaaaaaa (HEAD) total time 502.99s
* bbbbbbbbbb (base) 1 report, total time 47.84s
* cccccccccc 1 report, total time 332.50s
* dddddddddd 0 reports
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=502.99,
base_seconds={
fakehash('b'): [47.84],
fakehash('c'): [332.50],
fakehash('d'): [],
},
on_master=True,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 9988.77s
|/
* bbbbbbbbbb (base) 121 reports, total time 7654.32s ± 55.55s
* cccccccccc 20 reports, total time 5555.55s ± 253.19s
* dddddddddd 1 report, total time 1234.56s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=9988.77,
base_seconds={
fakehash('b'): [7598.77] * 60 + [7654.32] + [7709.87] * 60,
fakehash('c'): [5308.77] * 10 + [5802.33] * 10,
fakehash('d'): [1234.56],
},
on_master=False,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 25.52s
| |
| : (5 commits)
|/
* bbbbbbbbbb 0 reports
* cccccccccc 0 reports
* dddddddddd (base) 15 reports, total time 58.92s ± 25.82s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=25.52,
base_seconds={
fakehash('b'): [],
fakehash('c'): [],
fakehash('d'): [52.25] * 14 + [152.26],
},
on_master=False,
ancestry_path=5,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 0.08s
|/|
| : (1 commit)
|
* bbbbbbbbbb 0 reports
* cccccccccc (base) 1 report, total time 0.09s
* dddddddddd 3 reports, total time 0.10s ± 0.05s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=0.08,
base_seconds={
fakehash('b'): [],
fakehash('c'): [0.09],
fakehash('d'): [0.05, 0.10, 0.15],
},
on_master=False,
other_ancestors=1,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 5.98s
| |
| : (1 commit)
|/|
| : (7 commits)
|
* bbbbbbbbbb (base) 2 reports, total time 6.02s ± 1.71s
* cccccccccc 0 reports
* dddddddddd 10 reports, total time 5.84s ± 0.92s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=5.98,
base_seconds={
fakehash('b'): [4.81, 7.23],
fakehash('c'): [],
fakehash('d'): [4.97] * 5 + [6.71] * 5,
},
on_master=False,
ancestry_path=1,
other_ancestors=7,
)
)
def test_regression_info(self):
self.assertEqual(
'''\
----- Historic stats comparison result ------
job: foo_job
commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 3.02s
|/
* bbbbbbbbbb (base) 1 report, total time 41.00s
* cccccccccc 1 report, total time 43.00s
|
:
Removed (across 1 suite) 1 test, totaling - 1.00s
Modified (across 1 suite) 1 test, totaling - 41.48s ± 2.12s
Added (across 1 suite) 1 test, totaling + 3.00s
''',
print_test_stats.regression_info(
head_sha=fakehash('a'),
head_report=make_report_v1({
'Foo': [
makecase('test_foo', 0.02, skipped=True),
makecase('test_baz', 3),
]}),
base_reports={
fakehash('b'): [
make_report_v1({
'Foo': [
makecase('test_foo', 40),
makecase('test_bar', 1),
],
}),
],
fakehash('c'): [
make_report_v1({
'Foo': [
makecase('test_foo', 43),
],
}),
],
},
job_name='foo_job',
on_master=False,
ancestry_path=0,
other_ancestors=0,
)
)
def test_regression_info_new_job(self):
self.assertEqual(
'''\
----- Historic stats comparison result ------
job: foo_job
commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 3.02s
| |
| : (3 commits)
|/|
| : (2 commits)
|
* bbbbbbbbbb 0 reports
* cccccccccc 0 reports
|
:
Removed (across 0 suites) 0 tests, totaling 0.00s
Modified (across 0 suites) 0 tests, totaling 0.00s
Added (across 1 suite) 2 tests, totaling + 3.02s
''',
print_test_stats.regression_info(
head_sha=fakehash('a'),
head_report=make_report_v1({
'Foo': [
makecase('test_foo', 0.02, skipped=True),
makecase('test_baz', 3),
]}),
base_reports={
fakehash('b'): [],
fakehash('c'): [],
},
job_name='foo_job',
on_master=False,
ancestry_path=3,
other_ancestors=2,
)
)
class TestFrameworkUtils(TestCase):
tests = [
'super_long_test',


@@ -13,82 +13,16 @@ from collections import defaultdict
from glob import glob
from pathlib import Path
from typing import (Any, DefaultDict, Dict, Iterable, Iterator, List, Optional,
Set, Tuple, Union, cast)
Set, Tuple, cast)
from xml.dom import minidom # type: ignore[import]
import requests
from typing_extensions import Literal, TypedDict
try:
import boto3 # type: ignore[import]
HAVE_BOTO3 = True
except ImportError:
HAVE_BOTO3 = False
# TODO: consolidate these typedefs with the identical ones in
# tools/test_history.py
Commit = str # 40-digit SHA-1 hex string
Status = Optional[Literal['errored', 'failed', 'skipped']]
from typing_extensions import TypedDict
from tools.stats_utils.s3_stat_parser import (newify_case, get_S3_object_from_bucket, get_S3_bucket_readonly,
Report, Status, Commit, HAVE_BOTO3, Version2Case, VersionedReport,
Version1Report, Version2Report, ReportMetaMeta)
class CaseMeta(TypedDict):
seconds: float
class Version1Case(CaseMeta):
name: str
errored: bool
failed: bool
skipped: bool
class Version1Suite(TypedDict):
total_seconds: float
cases: List[Version1Case]
class ReportMetaMeta(TypedDict):
build_pr: str
build_tag: str
build_sha1: Commit
build_branch: str
build_job: str
build_workflow_id: str
class ReportMeta(ReportMetaMeta):
total_seconds: float
class Version1Report(ReportMeta):
suites: Dict[str, Version1Suite]
class Version2Case(CaseMeta):
status: Status
class Version2Suite(TypedDict):
total_seconds: float
cases: Dict[str, Version2Case]
class Version2File(TypedDict):
total_seconds: float
suites: Dict[str, Version2Suite]
class VersionedReport(ReportMeta):
format_version: int
# report: Version2Report implies report['format_version'] == 2
class Version2Report(VersionedReport):
files: Dict[str, Version2File]
Report = Union[Version1Report, VersionedReport]
SimplerSuite = Dict[str, Version2Case]
SimplerFile = Dict[str, SimplerSuite]
@@ -115,24 +49,6 @@ class SuiteDiff(TypedDict):
cases: List[CaseDiff]
# TODO: consolidate this with the case_status function from
# tools/test_history.py
def case_status(case: Version1Case) -> Status:
for k in {'errored', 'failed', 'skipped'}:
if case[k]: # type: ignore[misc]
return cast(Status, k)
return None
# TODO: consolidate this with the newify_case function from
# tools/test_history.py
def newify_case(case: Version1Case) -> Version2Case:
return {
'seconds': case['seconds'],
'status': case_status(case),
}
# TODO: consolidate this with the get_cases function from
# tools/test_history.py
@@ -848,8 +764,7 @@ def send_report_to_s3(head_report: Version2Report) -> None:
return
now = datetime.datetime.utcnow().isoformat()
key = f'test_time/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC
s3 = boto3.resource('s3')
obj = s3.Object('ossci-metrics', key)
obj = get_S3_object_from_bucket('ossci-metrics', key)
# use bz2 because the results are smaller than gzip, and the
# compression time penalty we pay is only about half a second for
# input files of a few megabytes in size like these JSON files, and
@@ -890,8 +805,7 @@ def print_regressions(head_report: Report, *, num_prev_commits: int) -> None:
commits = commits[:-1]
job = os.environ.get("CIRCLE_JOB", "")
s3 = boto3.resource("s3")
bucket = s3.Bucket(name="ossci-metrics")
bucket = get_S3_bucket_readonly('ossci-metrics')
index = {}
for commit in commits:
summaries = bucket.objects.filter(Prefix=f"test_time/{commit}/{job}/")



@@ -0,0 +1,129 @@
from typing import Dict, List, Optional, Union, Any, cast
from typing_extensions import Literal, TypedDict
try:
import boto3 # type: ignore[import]
import botocore # type: ignore[import]
HAVE_BOTO3 = True
except ImportError:
HAVE_BOTO3 = False
Commit = str # 40-digit SHA-1 hex string
Status = Optional[Literal['errored', 'failed', 'skipped']]
class CaseMeta(TypedDict):
seconds: float
class Version1Case(CaseMeta):
name: str
errored: bool
failed: bool
skipped: bool
class Version1Suite(TypedDict):
total_seconds: float
cases: List[Version1Case]
class ReportMetaMeta(TypedDict):
build_pr: str
build_tag: str
build_sha1: Commit
build_branch: str
build_job: str
build_workflow_id: str
class ReportMeta(ReportMetaMeta):
total_seconds: float
class Version1Report(ReportMeta):
suites: Dict[str, Version1Suite]
class Version2Case(CaseMeta):
status: Status
class Version2Suite(TypedDict):
total_seconds: float
cases: Dict[str, Version2Case]
class Version2File(TypedDict):
total_seconds: float
suites: Dict[str, Version2Suite]
class VersionedReport(ReportMeta):
format_version: int
# report: Version2Report implies report['format_version'] == 2
class Version2Report(VersionedReport):
files: Dict[str, Version2File]
Report = Union[Version1Report, VersionedReport]
def get_S3_bucket_readonly(bucket_name: str) -> Any:
s3 = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED))
return s3.Bucket(bucket_name)
def get_S3_object_from_bucket(bucket_name: str, object: str) -> Any:
s3 = boto3.resource('s3')
return s3.Object(bucket_name, object)
def case_status(case: Version1Case) -> Status:
for k in {'errored', 'failed', 'skipped'}:
if case[k]: # type: ignore[misc]
return cast(Status, k)
return None
def newify_case(case: Version1Case) -> Version2Case:
return {
'seconds': case['seconds'],
'status': case_status(case),
}
def get_cases(
*,
data: Report,
filename: Optional[str],
suite_name: Optional[str],
test_name: str,
) -> List[Version2Case]:
cases: List[Version2Case] = []
if 'format_version' not in data: # version 1 implicitly
v1report = cast(Version1Report, data)
suites = v1report['suites']
for sname, v1suite in suites.items():
if sname == suite_name or not suite_name:
for v1case in v1suite['cases']:
if v1case['name'] == test_name:
cases.append(newify_case(v1case))
else:
v_report = cast(VersionedReport, data)
version = v_report['format_version']
if version == 2:
v2report = cast(Version2Report, v_report)
for fname, v2file in v2report['files'].items():
if fname == filename or not filename:
for sname, v2suite in v2file['suites'].items():
if sname == suite_name or not suite_name:
v2case = v2suite['cases'].get(test_name)
if v2case:
cases.append(v2case)
else:
raise RuntimeError(f'Unknown format version: {version}')
return cases
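For orientation, here is a minimal usage sketch of the new module (a hedged example: it assumes boto3 is installed, and the SHA and job name below are placeholders, not real values). It mirrors the fetch-decompress-parse pattern used by the callers above.

```python
# Hypothetical end-to-end use of the consolidated helpers: list one commit's
# reports in the public ossci-metrics bucket and extract one test's timings.
import bz2
import json

from tools.stats_utils.s3_stat_parser import get_S3_bucket_readonly, get_cases

sha = 'a' * 40            # placeholder 40-char commit SHA
job = 'some_ci_job'       # placeholder CI job name

bucket = get_S3_bucket_readonly('ossci-metrics')
for summary in bucket.objects.filter(Prefix=f'test_time/{sha}/{job}/'):
    binary = summary.get()['Body'].read()
    report = json.loads(bz2.decompress(binary).decode('utf-8'))
    # get_cases handles both version-1 and version-2 report formats
    print(get_cases(data=report, filename=None, suite_name=None, test_name='test_foo1'))
```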

tools/test/test_stats.py (new file, 637 lines)

@@ -0,0 +1,637 @@
import unittest
from tools import print_test_stats
def fakehash(char):
return char * 40
def dummy_meta_meta() -> print_test_stats.ReportMetaMeta:
return {
'build_pr': '',
'build_tag': '',
'build_sha1': '',
'build_branch': '',
'build_job': '',
'build_workflow_id': '',
}
def makecase(name, seconds, *, errored=False, failed=False, skipped=False):
return {
'name': name,
'seconds': seconds,
'errored': errored,
'failed': failed,
'skipped': skipped,
}
def make_report_v1(tests) -> print_test_stats.Version1Report:
suites = {
suite_name: {
'total_seconds': sum(case['seconds'] for case in cases),
'cases': cases,
}
for suite_name, cases in tests.items()
}
return {
**dummy_meta_meta(),
'total_seconds': sum(s['total_seconds'] for s in suites.values()),
'suites': suites,
}
def make_case_v2(seconds, status=None) -> print_test_stats.Version2Case:
return {
'seconds': seconds,
'status': status,
}
def make_report_v2(tests) -> print_test_stats.Version2Report:
files = {}
for file_name, file_suites in tests.items():
suites = {
suite_name: {
'total_seconds': sum(case['seconds'] for case in cases.values()),
'cases': cases,
}
for suite_name, cases in file_suites.items()
}
files[file_name] = {
'suites': suites,
'total_seconds': sum(suite['total_seconds'] for suite in suites.values()),
}
return {
**dummy_meta_meta(),
'format_version': 2,
'total_seconds': sum(s['total_seconds'] for s in files.values()),
'files': files,
}
maxDiff = None
class TestPrintTestStats(unittest.TestCase):
version1_report: print_test_stats.Version1Report = make_report_v1({
# input ordering of the suites is ignored
'Grault': [
# not printed: status same and time similar
makecase('test_grault0', 4.78, failed=True),
# status same, but time increased a lot
makecase('test_grault2', 1.473, errored=True),
],
# individual tests times changed, not overall suite
'Qux': [
# input ordering of the test cases is ignored
makecase('test_qux1', 0.001, skipped=True),
makecase('test_qux6', 0.002, skipped=True),
# time in bounds, but status changed
makecase('test_qux4', 7.158, failed=True),
# not printed because it's the same as before
makecase('test_qux7', 0.003, skipped=True),
makecase('test_qux5', 11.968),
makecase('test_qux3', 23.496),
],
# new test suite
'Bar': [
makecase('test_bar2', 3.742, failed=True),
makecase('test_bar1', 50.447),
],
# overall suite time changed but no individual tests
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
# suite doesn't show up if it doesn't change enough
'Foo': [
makecase('test_foo1', 42),
makecase('test_foo2', 56),
],
})
version2_report: print_test_stats.Version2Report = make_report_v2(
{
'test_a': {
'Grault': {
'test_grault0': make_case_v2(4.78, 'failed'),
'test_grault2': make_case_v2(1.473, 'errored'),
},
'Qux': {
'test_qux1': make_case_v2(0.001, 'skipped'),
'test_qux6': make_case_v2(0.002, 'skipped'),
'test_qux4': make_case_v2(7.158, 'failed'),
'test_qux7': make_case_v2(0.003, 'skipped'),
'test_qux8': make_case_v2(11.968),
'test_qux3': make_case_v2(23.496),
}
},
'test_b': {
'Bar': {
'test_bar2': make_case_v2(3.742, 'failed'),
'test_bar1': make_case_v2(50.447),
},
# overall suite time changed but no individual tests
'Norf': {
'test_norf1': make_case_v2(3),
'test_norf2': make_case_v2(3),
'test_norf3': make_case_v2(3),
'test_norf4': make_case_v2(3),
},
},
'test_c': {
'Foo': {
'test_foo1': make_case_v2(42),
'test_foo2': make_case_v2(56),
},
}
})
def test_simplify(self):
self.assertEqual(
{
'': {
'Bar': {
'test_bar1': {'seconds': 50.447, 'status': None},
'test_bar2': {'seconds': 3.742, 'status': 'failed'},
},
'Foo': {
'test_foo1': {'seconds': 42, 'status': None},
'test_foo2': {'seconds': 56, 'status': None},
},
'Grault': {
'test_grault0': {'seconds': 4.78, 'status': 'failed'},
'test_grault2': {'seconds': 1.473, 'status': 'errored'},
},
'Norf': {
'test_norf1': {'seconds': 3, 'status': None},
'test_norf3': {'seconds': 3, 'status': None},
'test_norf2': {'seconds': 3, 'status': None},
'test_norf4': {'seconds': 3, 'status': None},
},
'Qux': {
'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
'test_qux3': {'seconds': 23.496, 'status': None},
'test_qux4': {'seconds': 7.158, 'status': 'failed'},
'test_qux5': {'seconds': 11.968, 'status': None},
'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
},
},
},
print_test_stats.simplify(self.version1_report)
)
self.assertEqual(
{
'test_a': {
'Grault': {
'test_grault0': {'seconds': 4.78, 'status': 'failed'},
'test_grault2': {'seconds': 1.473, 'status': 'errored'},
},
'Qux': {
'test_qux1': {'seconds': 0.001, 'status': 'skipped'},
'test_qux3': {'seconds': 23.496, 'status': None},
'test_qux4': {'seconds': 7.158, 'status': 'failed'},
'test_qux6': {'seconds': 0.002, 'status': 'skipped'},
'test_qux7': {'seconds': 0.003, 'status': 'skipped'},
'test_qux8': {'seconds': 11.968, 'status': None},
},
},
'test_b': {
'Bar': {
'test_bar1': {'seconds': 50.447, 'status': None},
'test_bar2': {'seconds': 3.742, 'status': 'failed'},
},
'Norf': {
'test_norf1': {'seconds': 3, 'status': None},
'test_norf2': {'seconds': 3, 'status': None},
'test_norf3': {'seconds': 3, 'status': None},
'test_norf4': {'seconds': 3, 'status': None},
},
},
'test_c': {
'Foo': {
'test_foo1': {'seconds': 42, 'status': None},
'test_foo2': {'seconds': 56, 'status': None},
},
},
},
print_test_stats.simplify(self.version2_report),
)
def test_analysis(self):
head_report = self.version1_report
base_reports = {
# bbbb has no reports, so base is cccc instead
fakehash('b'): [],
fakehash('c'): [
make_report_v1({
'Baz': [
makecase('test_baz2', 13.605),
# no recent suites have & skip this test
makecase('test_baz1', 0.004, skipped=True),
],
'Foo': [
makecase('test_foo1', 43),
# test added since dddd
makecase('test_foo2', 57),
],
'Grault': [
makecase('test_grault0', 4.88, failed=True),
makecase('test_grault1', 11.967, failed=True),
makecase('test_grault2', 0.395, errored=True),
makecase('test_grault3', 30.460),
],
'Norf': [
makecase('test_norf1', 2),
makecase('test_norf2', 2),
makecase('test_norf3', 2),
makecase('test_norf4', 2),
],
'Qux': [
makecase('test_qux3', 4.978, errored=True),
makecase('test_qux7', 0.002, skipped=True),
makecase('test_qux2', 5.618),
makecase('test_qux4', 7.766, errored=True),
makecase('test_qux6', 23.589, failed=True),
],
}),
],
fakehash('d'): [
make_report_v1({
'Foo': [
makecase('test_foo1', 40),
# removed in cccc
makecase('test_foo3', 17),
],
'Baz': [
# not skipped, so not included in stdev
makecase('test_baz1', 3.14),
],
'Qux': [
makecase('test_qux7', 0.004, skipped=True),
makecase('test_qux2', 6.02),
makecase('test_qux4', 20.932),
],
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
'Grault': [
makecase('test_grault0', 5, failed=True),
makecase('test_grault1', 14.325, failed=True),
makecase('test_grault2', 0.31, errored=True),
],
}),
],
fakehash('e'): [],
fakehash('f'): [
make_report_v1({
'Foo': [
makecase('test_foo3', 24),
makecase('test_foo1', 43),
],
'Baz': [
makecase('test_baz2', 16.857),
],
'Qux': [
makecase('test_qux2', 6.422),
makecase('test_qux4', 6.382, errored=True),
],
'Norf': [
makecase('test_norf1', 0.9),
makecase('test_norf3', 0.9),
makecase('test_norf2', 0.9),
makecase('test_norf4', 0.9),
],
'Grault': [
makecase('test_grault0', 4.7, failed=True),
makecase('test_grault1', 13.146, failed=True),
makecase('test_grault2', 0.48, errored=True),
],
}),
],
}
simpler_head = print_test_stats.simplify(head_report)
simpler_base = {}
for commit, reports in base_reports.items():
simpler_base[commit] = [print_test_stats.simplify(r) for r in reports]
analysis = print_test_stats.analyze(
head_report=simpler_head,
base_reports=simpler_base,
)
self.assertEqual(
'''\
- class Baz:
- # was 15.23s ± 2.30s
-
- def test_baz1: ...
- # was 0.004s (skipped)
-
- def test_baz2: ...
- # was 15.231s ± 2.300s
class Grault:
# was 48.86s ± 1.19s
# now 6.25s
- def test_grault1: ...
- # was 13.146s ± 1.179s (failed)
- def test_grault3: ...
- # was 30.460s
class Qux:
# was 41.66s ± 1.06s
# now 42.63s
- def test_qux2: ...
- # was 6.020s ± 0.402s
! def test_qux3: ...
! # was 4.978s (errored)
! # now 23.496s
! def test_qux4: ...
! # was 7.074s ± 0.979s (errored)
! # now 7.158s (failed)
! def test_qux6: ...
! # was 23.589s (failed)
! # now 0.002s (skipped)
+ def test_qux1: ...
+ # now 0.001s (skipped)
+ def test_qux5: ...
+ # now 11.968s
+ class Bar:
+ # now 54.19s
+
+ def test_bar1: ...
+ # now 50.447s
+
+ def test_bar2: ...
+ # now 3.742s (failed)
''',
print_test_stats.anomalies(analysis),
)
def test_graph(self):
# HEAD is on master
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
* aaaaaaaaaa (HEAD) total time 502.99s
* bbbbbbbbbb (base) 1 report, total time 47.84s
* cccccccccc 1 report, total time 332.50s
* dddddddddd 0 reports
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=502.99,
base_seconds={
fakehash('b'): [47.84],
fakehash('c'): [332.50],
fakehash('d'): [],
},
on_master=True,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 9988.77s
|/
* bbbbbbbbbb (base) 121 reports, total time 7654.32s ± 55.55s
* cccccccccc 20 reports, total time 5555.55s ± 253.19s
* dddddddddd 1 report, total time 1234.56s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=9988.77,
base_seconds={
fakehash('b'): [7598.77] * 60 + [7654.32] + [7709.87] * 60,
fakehash('c'): [5308.77] * 10 + [5802.33] * 10,
fakehash('d'): [1234.56],
},
on_master=False,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 25.52s
| |
| : (5 commits)
|/
* bbbbbbbbbb 0 reports
* cccccccccc 0 reports
* dddddddddd (base) 15 reports, total time 58.92s ± 25.82s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=25.52,
base_seconds={
fakehash('b'): [],
fakehash('c'): [],
fakehash('d'): [52.25] * 14 + [152.26],
},
on_master=False,
ancestry_path=5,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 0.08s
|/|
| : (1 commit)
|
* bbbbbbbbbb 0 reports
* cccccccccc (base) 1 report, total time 0.09s
* dddddddddd 3 reports, total time 0.10s ± 0.05s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=0.08,
base_seconds={
fakehash('b'): [],
fakehash('c'): [0.09],
fakehash('d'): [0.05, 0.10, 0.15],
},
on_master=False,
other_ancestors=1,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 5.98s
| |
| : (1 commit)
|/|
| : (7 commits)
|
* bbbbbbbbbb (base) 2 reports, total time 6.02s ± 1.71s
* cccccccccc 0 reports
* dddddddddd 10 reports, total time 5.84s ± 0.92s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=5.98,
base_seconds={
fakehash('b'): [4.81, 7.23],
fakehash('c'): [],
fakehash('d'): [4.97] * 5 + [6.71] * 5,
},
on_master=False,
ancestry_path=1,
other_ancestors=7,
)
)
def test_regression_info(self):
self.assertEqual(
'''\
----- Historic stats comparison result ------
job: foo_job
commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 3.02s
|/
* bbbbbbbbbb (base) 1 report, total time 41.00s
* cccccccccc 1 report, total time 43.00s
|
:
Removed (across 1 suite) 1 test, totaling - 1.00s
Modified (across 1 suite) 1 test, totaling - 41.48s ± 2.12s
Added (across 1 suite) 1 test, totaling + 3.00s
''',
print_test_stats.regression_info(
head_sha=fakehash('a'),
head_report=make_report_v1({
'Foo': [
makecase('test_foo', 0.02, skipped=True),
makecase('test_baz', 3),
]}),
base_reports={
fakehash('b'): [
make_report_v1({
'Foo': [
makecase('test_foo', 40),
makecase('test_bar', 1),
],
}),
],
fakehash('c'): [
make_report_v1({
'Foo': [
makecase('test_foo', 43),
],
}),
],
},
job_name='foo_job',
on_master=False,
ancestry_path=0,
other_ancestors=0,
)
)
def test_regression_info_new_job(self):
self.assertEqual(
'''\
----- Historic stats comparison result ------
job: foo_job
commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 3.02s
| |
| : (3 commits)
|/|
| : (2 commits)
|
* bbbbbbbbbb 0 reports
* cccccccccc 0 reports
|
:
Removed (across 0 suites) 0 tests, totaling 0.00s
Modified (across 0 suites) 0 tests, totaling 0.00s
Added (across 1 suite) 2 tests, totaling + 3.02s
''',
print_test_stats.regression_info(
head_sha=fakehash('a'),
head_report=make_report_v1({
'Foo': [
makecase('test_foo', 0.02, skipped=True),
makecase('test_baz', 3),
]}),
base_reports={
fakehash('b'): [],
fakehash('c'): [],
},
job_name='foo_job',
on_master=False,
ancestry_path=3,
other_ancestors=2,
)
)
if __name__ == '__main__':
unittest.main()


@@ -6,11 +6,8 @@ import json
import subprocess
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast
import boto3 # type: ignore[import]
import botocore # type: ignore[import]
from typing_extensions import Literal, TypedDict
from typing import Any, Dict, List, Optional, Set, Tuple
from tools.stats_utils.s3_stat_parser import (get_S3_bucket_readonly, get_cases, Report)
def get_git_commit_history(
@@ -35,73 +32,6 @@ def get_object_summaries(*, bucket: Any, sha: str) -> Dict[str, List[Any]]:
by_job[job].append(summary)
return dict(by_job)
# TODO: consolidate these typedefs with the identical ones in
# torch/testing/_internal/print_test_stats.py
Commit = str # 40-digit SHA-1 hex string
Status = Optional[Literal['errored', 'failed', 'skipped']]
class CaseMeta(TypedDict):
seconds: float
class Version1Case(CaseMeta):
name: str
errored: bool
failed: bool
skipped: bool
class Version1Suite(TypedDict):
total_seconds: float
cases: List[Version1Case]
class ReportMetaMeta(TypedDict):
build_pr: str
build_tag: str
build_sha1: Commit
build_branch: str
build_job: str
build_workflow_id: str
class ReportMeta(ReportMetaMeta):
total_seconds: float
class Version1Report(ReportMeta):
suites: Dict[str, Version1Suite]
class Version2Case(CaseMeta):
status: Status
class Version2Suite(TypedDict):
total_seconds: float
cases: Dict[str, Version2Case]
class Version2File(TypedDict):
total_seconds: float
suites: Dict[str, Version2Suite]
class VersionedReport(ReportMeta):
format_version: int
# report: Version2Report implies report['format_version'] == 2
class Version2Report(VersionedReport):
files: Dict[str, Version2File]
Report = Union[Version1Report, VersionedReport]
def get_jsons(
jobs: Optional[List[str]],
summaries: Dict[str, Any],
@@ -116,59 +46,6 @@ def get_jsons(
}
# TODO: consolidate this with the case_status function from
# torch/testing/_internal/print_test_stats.py
def case_status(case: Version1Case) -> Status:
for k in {'errored', 'failed', 'skipped'}:
if case[k]: # type: ignore[misc]
return cast(Status, k)
return None
# TODO: consolidate this with the newify_case function from
# torch/testing/_internal/print_test_stats.py
def newify_case(case: Version1Case) -> Version2Case:
return {
'seconds': case['seconds'],
'status': case_status(case),
}
# TODO: consolidate this with the simplify function from
# torch/testing/_internal/print_test_stats.py
def get_cases(
*,
data: Report,
filename: Optional[str],
suite_name: Optional[str],
test_name: str,
) -> List[Version2Case]:
cases: List[Version2Case] = []
if 'format_version' not in data: # version 1 implicitly
v1report = cast(Version1Report, data)
suites = v1report['suites']
for sname, v1suite in suites.items():
if sname == suite_name or not suite_name:
for v1case in v1suite['cases']:
if v1case['name'] == test_name:
cases.append(newify_case(v1case))
else:
v_report = cast(VersionedReport, data)
version = v_report['format_version']
if version == 2:
v2report = cast(Version2Report, v_report)
for fname, v2file in v2report['files'].items():
if fname == filename or not filename:
for sname, v2suite in v2file['suites'].items():
if sname == suite_name or not suite_name:
v2case = v2suite['cases'].get(test_name)
if v2case:
cases.append(v2case)
else:
raise RuntimeError(f'Unknown format version: {version}')
return cases
def make_column(
*,
data: Optional[Report],
@@ -455,9 +332,7 @@ indicated test was not found in that report.
parser.error('No jobs specified.')
commits = get_git_commit_history(path=args.pytorch, ref=args.ref)
s3 = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED))
bucket = s3.Bucket('ossci-metrics')
bucket = get_S3_bucket_readonly('ossci-metrics')
display_history(
bucket=bucket,