Revert D26232345: [pytorch][PR] Report test time regressions

Test Plan: revert-hammer

Differential Revision: D26232345 (7467f90b13)

Original commit changeset: b687b1737519

fbshipit-source-id: 10a031c5500b083f7c82f2ae2743b671c5a07bff
Author: Sam Estep, 2021-02-08 10:12:37 -08:00 (committed by Facebook GitHub Bot)
parent 1aaddd83a5
commit 21dccbca62
7 changed files with 259 additions and 1326 deletions


@@ -642,9 +642,9 @@ jobs:
export CIRCLE_JOB="$CIRCLE_JOB"
export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID"
cd workspace
python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test
python test/print_test_stats.py --upload-to-s3 test
EOL
echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh
echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh
unbuffer bash command.sh | ts
echo "Retrieving test reports"


@@ -201,9 +201,9 @@ jobs:
export CIRCLE_JOB="$CIRCLE_JOB"
export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID"
cd workspace
python torch/testing/_internal/print_test_stats.py --upload-to-s3 --compare-with-s3 test
python test/print_test_stats.py --upload-to-s3 test
EOL
echo "(cat docker_commands.sh | docker exec -u jenkins -e LANG=C.UTF-8 -i "$id" bash) 2>&1" > command.sh
echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh
unbuffer bash command.sh | ts
echo "Retrieving test reports"


@@ -242,9 +242,3 @@ ignore_missing_imports = True
[mypy-mypy.*]
ignore_missing_imports = True
[mypy-xml.*]
ignore_missing_imports = True
[mypy-boto3.*]
ignore_missing_imports = True

test/print_test_stats.py (new executable file, 254 lines)

@@ -0,0 +1,254 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Read and print test results statistics
from xml.dom import minidom
from glob import glob
import bz2
import json
import os
import statistics
import time
import boto3
import datetime
import requests
class TestCase:
def __init__(self, dom):
self.class_name = str(dom.attributes['classname'].value)
self.name = str(dom.attributes['name'].value)
self.time = float(dom.attributes['time'].value)
self.errored = len(dom.getElementsByTagName('error')) > 0
self.failed = len(dom.getElementsByTagName('failure')) > 0
self.skipped = len(dom.getElementsByTagName('skipped')) > 0
class TestSuite:
def __init__(self, name):
self.name = name
self.test_cases = []
self.failed_count = 0
self.skipped_count = 0
self.errored_count = 0
self.total_time = 0.0
def __repr__(self):
rc = f'{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}'
if self.skipped_count > 0:
rc += f' skipped: {self.skipped_count}'
return f'TestSuite({rc})'
def append(self, test_case):
self.test_cases.append(test_case)
self.total_time += test_case.time
self.failed_count += 1 if test_case.failed else 0
self.skipped_count += 1 if test_case.skipped else 0
self.errored_count += 1 if test_case.errored else 0
def print_report(self, num_longest=3):
sorted_tests = sorted(self.test_cases, key=lambda x: x.time)
test_count = len(sorted_tests)
print(f"class {self.name}:")
print(f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} errored: {self.errored_count}")
print(f" run_time: {self.total_time:.2f} seconds")
print(f" avg_time: {self.total_time/test_count:.2f} seconds")
if test_count >= 2:
print(f" median_time: {statistics.median(x.time for x in sorted_tests):.2f} seconds")
sorted_tests = sorted_tests[-num_longest:]
print(f" {len(sorted_tests)} longest tests:")
for test in reversed(sorted_tests):
print(f" {test.name} time: {test.time:.2f} seconds")
print("")
def parse_report(path):
dom = minidom.parse(path)
for test_case in dom.getElementsByTagName('testcase'):
yield TestCase(test_case)
def parse_reports(folder):
reports = glob(os.path.join(folder, '**', '*.xml'), recursive=True)
tests_by_class = dict()
for report in reports:
for test_case in parse_report(report):
class_name = test_case.class_name
if class_name not in tests_by_class:
tests_by_class[class_name] = TestSuite(class_name)
tests_by_class[class_name].append(test_case)
return tests_by_class
def build_info():
return {
"build_pr": os.environ.get("CIRCLE_PR_NUMBER"),
"build_tag": os.environ.get("CIRCLE_TAG"),
"build_sha1": os.environ.get("CIRCLE_SHA1"),
"build_branch": os.environ.get("CIRCLE_BRANCH"),
"build_job": os.environ.get("CIRCLE_JOB"),
"build_workflow_id": os.environ.get("CIRCLE_WORKFLOW_ID"),
}
def build_message(test_case):
return {
"normal": {
**build_info(),
"test_suite_name": test_case.class_name,
"test_case_name": test_case.name,
},
"int": {
"time": int(time.time()),
"test_total_count": 1,
"test_total_time": int(test_case.time * 1000),
"test_failed_count": 1 if test_case.failed > 0 else 0,
"test_skipped_count": 1 if test_case.skipped > 0 else 0,
"test_errored_count": 1 if test_case.errored > 0 else 0,
},
}
def send_report_to_scribe(reports):
access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN")
if not access_token:
print("No scribe access token provided, skip sending report!")
return
print("Scribe access token provided, sending report...")
url = "https://graph.facebook.com/scribe_logs"
r = requests.post(
url,
data={
"access_token": access_token,
"logs": json.dumps(
[
{
"category": "perfpipe_pytorch_test_times",
"message": json.dumps(build_message(test_case)),
"line_escape": False,
}
for name in sorted(reports.keys())
for test_case in reports[name].test_cases
]
),
},
)
r.raise_for_status()
def send_report_to_s3(reports, *, total_seconds):
job = os.environ.get('CIRCLE_JOB')
sha1 = os.environ.get('CIRCLE_SHA1')
branch = os.environ.get('CIRCLE_BRANCH', '')
if branch not in ['master', 'nightly'] and not branch.startswith("release/"):
print("S3 upload only enabled on master, nightly and release branches.")
print(f"skipping test report on branch: {branch}")
return
now = datetime.datetime.utcnow().isoformat()
key = f'test_time/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC
s3 = boto3.resource('s3')
try:
s3.get_bucket_acl(Bucket='ossci-metrics')
except Exception as e:
print(f"AWS ACL failed: {e}")
print("AWS credential found, uploading to S3...")
obj = s3.Object('ossci-metrics', key)
print("")
# use bz2 because the results are smaller than gzip, and the
# compression time penalty we pay is only about half a second for
# input files of a few megabytes in size like these JSON files, and
# because for some reason zlib doesn't seem to play nice with the
# gunzip command whereas Python's bz2 does work with bzip2
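# illustrative only (not used by this script): such an object can later be
# read back with, e.g.
#   raw = s3.Object('ossci-metrics', key).get()["Body"].read()
#   report = json.loads(bz2.decompress(raw).decode("utf-8"))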
obj.put(Body=bz2.compress(json.dumps({
**build_info(),
'total_seconds': total_seconds,
'suites': {
name: {
'total_seconds': suite.total_time,
'cases': [
{
'name': case.name,
'seconds': case.time,
'errored': case.errored,
'failed': case.failed,
'skipped': case.skipped,
}
for case in suite.test_cases
],
}
for name, suite in reports.items()
}
}).encode()))
def positive_integer(value):
parsed = int(value)
if parsed < 1:
raise argparse.ArgumentTypeError(f"{value} is not a natural number")
return parsed
def positive_float(value):
parsed = float(value)
if parsed <= 0.0:
raise argparse.ArgumentTypeError(f"{value} is not a positive rational number")
return parsed
if __name__ == '__main__':
import argparse
import sys
parser = argparse.ArgumentParser(
"Print statistics from test XML output.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--longest-of-class",
type=positive_integer,
default=3,
metavar="N",
help="how many longest tests to show for each class",
)
parser.add_argument(
"--class-print-threshold",
type=positive_float,
default=1.0,
metavar="N",
help="Minimal total time to warrant class report",
)
parser.add_argument(
"--longest-of-run",
type=positive_integer,
default=10,
metavar="N",
help="how many longest tests to show from the entire run",
)
parser.add_argument(
"--upload-to-s3",
action="store_true",
help="upload test time to S3 bucket",
)
parser.add_argument(
"folder",
help="test report folder",
)
args = parser.parse_args()
reports = parse_reports(args.folder)
if len(reports) == 0:
print(f"No test reports found in {args.folder}")
sys.exit(0)
send_report_to_scribe(reports)
longest_tests = []
total_time = 0
for name in sorted(reports.keys()):
test_suite = reports[name]
if test_suite.total_time >= args.class_print_threshold:
test_suite.print_report(args.longest_of_class)
total_time += test_suite.total_time
longest_tests.extend(test_suite.test_cases)
longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:]
if args.upload_to_s3:
send_report_to_s3(reports, total_seconds=total_time)
print(f"Total runtime is {datetime.timedelta(seconds=int(total_time))}")
print(f"{len(longest_tests)} longest tests of entire run:")
for test_case in reversed(longest_tests):
print(f" {test_case.class_name}.{test_case.name} time: {test_case.time:.2f} seconds")


@@ -8,7 +8,6 @@ from torch.testing._internal.common_utils import \
from torch.testing._internal.common_device_type import \
(instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes)
from torch.testing._internal import mypy_wrapper
from torch.testing._internal import print_test_stats
# For testing TestCase methods and torch.testing functions
class TestTesting(TestCase):
@@ -546,456 +545,5 @@ class TestMypyWrapper(TestCase):
))
def fakehash(char):
return char * 40
def makecase(name, seconds, *, errored=False, failed=False, skipped=False):
return {
'name': name,
'seconds': seconds,
'errored': errored,
'failed': failed,
'skipped': skipped,
}
def makereport(tests):
suites = {
suite_name: {
'total_seconds': sum(case['seconds'] for case in cases),
'cases': cases,
}
for suite_name, cases in tests.items()
}
return {
'total_seconds': sum(s['total_seconds'] for s in suites.values()),
'suites': suites,
}
class TestPrintTestStats(TestCase):
maxDiff = None
def test_analysis(self):
head_report = makereport({
# input ordering of the suites is ignored
'Grault': [
# not printed: status same and time similar
makecase('test_grault0', 4.78, failed=True),
# status same, but time increased a lot
makecase('test_grault2', 1.473, errored=True),
],
# individual tests times changed, not overall suite
'Qux': [
# input ordering of the test cases is ignored
makecase('test_qux1', 0.001, skipped=True),
makecase('test_qux6', 0.002, skipped=True),
# time in bounds, but status changed
makecase('test_qux4', 7.158, failed=True),
# not printed because it's the same as before
makecase('test_qux7', 0.003, skipped=True),
makecase('test_qux5', 11.968),
makecase('test_qux3', 23.496),
],
# new test suite
'Bar': [
makecase('test_bar2', 3.742, failed=True),
makecase('test_bar1', 50.447),
],
# overall suite time changed but no individual tests
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
# suite doesn't show up if it doesn't change enough
'Foo': [
makecase('test_foo1', 42),
makecase('test_foo2', 56),
],
})
base_reports = {
# bbbb has no reports, so base is cccc instead
fakehash('b'): [],
fakehash('c'): [
makereport({
'Baz': [
makecase('test_baz2', 13.605),
# no other recent report both has and skips this test
makecase('test_baz1', 0.004, skipped=True),
],
'Foo': [
makecase('test_foo1', 43),
# test added since dddd
makecase('test_foo2', 57),
],
'Grault': [
makecase('test_grault0', 4.88, failed=True),
makecase('test_grault1', 11.967, failed=True),
makecase('test_grault2', 0.395, errored=True),
makecase('test_grault3', 30.460),
],
'Norf': [
makecase('test_norf1', 2),
makecase('test_norf2', 2),
makecase('test_norf3', 2),
makecase('test_norf4', 2),
],
'Qux': [
makecase('test_qux3', 4.978, errored=True),
makecase('test_qux7', 0.002, skipped=True),
makecase('test_qux2', 5.618),
makecase('test_qux4', 7.766, errored=True),
makecase('test_qux6', 23.589, failed=True),
],
}),
],
fakehash('d'): [
makereport({
'Foo': [
makecase('test_foo1', 40),
# removed in cccc
makecase('test_foo3', 17),
],
'Baz': [
# not skipped, so not included in stdev
makecase('test_baz1', 3.14),
],
'Qux': [
makecase('test_qux7', 0.004, skipped=True),
makecase('test_qux2', 6.02),
makecase('test_qux4', 20.932),
],
'Norf': [
makecase('test_norf1', 3),
makecase('test_norf2', 3),
makecase('test_norf3', 3),
makecase('test_norf4', 3),
],
'Grault': [
makecase('test_grault0', 5, failed=True),
makecase('test_grault1', 14.325, failed=True),
makecase('test_grault2', 0.31, errored=True),
],
}),
],
fakehash('e'): [],
fakehash('f'): [
makereport({
'Foo': [
makecase('test_foo3', 24),
makecase('test_foo1', 43),
],
'Baz': [
makecase('test_baz2', 16.857),
],
'Qux': [
makecase('test_qux2', 6.422),
makecase('test_qux4', 6.382, errored=True),
],
'Norf': [
makecase('test_norf1', 0.9),
makecase('test_norf3', 0.9),
makecase('test_norf2', 0.9),
makecase('test_norf4', 0.9),
],
'Grault': [
makecase('test_grault0', 4.7, failed=True),
makecase('test_grault1', 13.146, failed=True),
makecase('test_grault2', 0.48, errored=True),
],
}),
],
}
simpler_head = print_test_stats.simplify(head_report)
simpler_base = {}
for commit, reports in base_reports.items():
simpler_base[commit] = [print_test_stats.simplify(r) for r in reports]
analysis = print_test_stats.analyze(
head_report=simpler_head,
base_reports=simpler_base,
)
self.assertEqual(
'''\
- class Baz:
- # was 15.23s ± 2.30s
-
- def test_baz1: ...
- # was 0.004s (skipped)
-
- def test_baz2: ...
- # was 15.231s ± 2.300s
class Grault:
# was 48.86s ± 1.19s
# now 6.25s
- def test_grault1: ...
- # was 13.146s ± 1.179s (failed)
- def test_grault3: ...
- # was 30.460s
class Qux:
# was 41.66s ± 1.06s
# now 42.63s
- def test_qux2: ...
- # was 6.020s ± 0.402s
! def test_qux3: ...
! # was 4.978s (errored)
! # now 23.496s
! def test_qux4: ...
! # was 7.074s ± 0.979s (errored)
! # now 7.158s (failed)
! def test_qux6: ...
! # was 23.589s (failed)
! # now 0.002s (skipped)
+ def test_qux1: ...
+ # now 0.001s (skipped)
+ def test_qux5: ...
+ # now 11.968s
+ class Bar:
+ # now 54.19s
+
+ def test_bar1: ...
+ # now 50.447s
+
+ def test_bar2: ...
+ # now 3.742s (failed)
''',
print_test_stats.anomalies(analysis),
)
def test_graph(self):
# HEAD is on master
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
* aaaaaaaaaa (HEAD) total time 502.99s
* bbbbbbbbbb (base) 1 report, total time 47.84s
* cccccccccc 1 report, total time 332.50s
* dddddddddd 0 reports
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=502.99,
base_seconds={
fakehash('b'): [47.84],
fakehash('c'): [332.50],
fakehash('d'): [],
},
on_master=True,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 9988.77s
|/
* bbbbbbbbbb (base) 121 reports, total time 7654.32s ± 55.55s
* cccccccccc 20 reports, total time 5555.55s ± 253.19s
* dddddddddd 1 report, total time 1234.56s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=9988.77,
base_seconds={
fakehash('b'): [7598.77] * 60 + [7654.32] + [7709.87] * 60,
fakehash('c'): [5308.77] * 10 + [5802.33] * 10,
fakehash('d'): [1234.56],
},
on_master=False,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 25.52s
| |
| : (5 commits)
|/
* bbbbbbbbbb 0 reports
* cccccccccc 0 reports
* dddddddddd (base) 15 reports, total time 58.92s ± 25.82s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=25.52,
base_seconds={
fakehash('b'): [],
fakehash('c'): [],
fakehash('d'): [52.25] * 14 + [152.26],
},
on_master=False,
ancestry_path=5,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 0.08s
|/|
| : (1 commit)
|
* bbbbbbbbbb 0 reports
* cccccccccc (base) 1 report, total time 0.09s
* dddddddddd 3 reports, total time 0.10s ± 0.05s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=0.08,
base_seconds={
fakehash('b'): [],
fakehash('c'): [0.09],
fakehash('d'): [0.05, 0.10, 0.15],
},
on_master=False,
other_ancestors=1,
)
)
self.assertEqual(
'''\
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 5.98s
| |
| : (1 commit)
|/|
| : (7 commits)
|
* bbbbbbbbbb (base) 2 reports, total time 6.02s ± 1.71s
* cccccccccc 0 reports
* dddddddddd 10 reports, total time 5.84s ± 0.92s
|
:
''',
print_test_stats.graph(
head_sha=fakehash('a'),
head_seconds=5.98,
base_seconds={
fakehash('b'): [4.81, 7.23],
fakehash('c'): [],
fakehash('d'): [4.97] * 5 + [6.71] * 5,
},
on_master=False,
ancestry_path=1,
other_ancestors=7,
)
)
def test_regression_info(self):
self.assertEqual(
'''\
----- Historic stats comparison result ------
job: foo_job
commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
class Foo:
# was 42.50s ± 2.12s
# now 3.02s
- def test_bar: ...
- # was 1.000s
! def test_foo: ...
! # was 41.500s ± 2.121s
! # now 0.020s (skipped)
+ def test_baz: ...
+ # now 3.000s
Commit graph (base is most recent master ancestor with at least one S3 report):
: (master)
|
| * aaaaaaaaaa (HEAD) total time 3.02s
|/
* bbbbbbbbbb (base) 1 report, total time 41.00s
* cccccccccc 1 report, total time 43.00s
|
:
Removed (across 1 suite) 1 test, totaling - 1.00s
Modified (across 1 suite) 1 test, totaling - 41.48s ± 2.12s
Added (across 1 suite) 1 test, totaling + 3.00s
''',
print_test_stats.regression_info(
head_sha=fakehash('a'),
head_report=makereport({
'Foo': [
makecase('test_foo', 0.02, skipped=True),
makecase('test_baz', 3),
]}),
base_reports={
fakehash('b'): [
makereport({
'Foo': [
makecase('test_foo', 40),
makecase('test_bar', 1),
],
}),
],
fakehash('c'): [
makereport({
'Foo': [
makecase('test_foo', 43),
],
}),
],
},
job_name='foo_job',
on_master=False,
ancestry_path=0,
other_ancestors=0,
)
)
if __name__ == '__main__':
run_tests()


@@ -31,6 +31,7 @@ from .utils import (
BLOCKED_PYTHON_TESTS = {
"run_test.py",
"print_test_stats.py",
"test_dataloader.py",
"test_multiprocessing.py",
"test_multiprocessing_spawn.py",


@@ -1,864 +0,0 @@
#!/usr/bin/env python
import bz2
import datetime
import json
import math
import os
import statistics
import subprocess
import time
from collections import defaultdict
from glob import glob
from pathlib import Path
from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple
from xml.dom import minidom
import requests
from typing_extensions import TypedDict
try:
import boto3
HAVE_BOTO3 = True
except ImportError:
HAVE_BOTO3 = False
Commit = str # 40-digit SHA-1 hex string
Status = Optional[str] # errored, failed, skipped, or None
# represent suite as dict because indexing is useful
SimplerCase = Tuple[float, Status]
SimplerSuite = Dict[str, SimplerCase]
SimplerReport = Dict[str, SimplerSuite]
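# e.g. (hypothetical values) a SimplerReport looks like:
#   {'TestFoo': {'test_bar': (1.23, None), 'test_baz': (0.02, 'skipped')}}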
class Case(TypedDict):
name: str
seconds: float
errored: bool
failed: bool
skipped: bool
class Suite(TypedDict):
total_seconds: float
cases: List[Case]
class ReportMeta(TypedDict, total=False):
build_pr: str
build_tag: str
build_sha1: Commit
build_branch: str
build_job: str
build_workflow_id: str
class Report(ReportMeta):
total_seconds: float
suites: Dict[str, Suite]
class Stat(TypedDict):
center: float
spread: Optional[float]
class CaseDiff(TypedDict):
margin: str
name: str
was: Optional[Tuple[Stat, Status]]
now: Optional[SimplerCase]
class SuiteDiff(TypedDict):
margin: str
name: str
was: Optional[Stat]
now: Optional[float]
cases: List[CaseDiff]
def case_status(case: Case) -> Status:
for k in {'errored', 'failed', 'skipped'}:
if case[k]: # type: ignore
return k
return None
def simplify(report: Report) -> SimplerReport:
return {
suite_name: {
case['name']: (case['seconds'], case_status(case))
for case in suite['cases']
}
for suite_name, suite in report['suites'].items()
}
def plural(n: int) -> str:
return '' if n == 1 else 's'
def display_stat(
x: Stat,
format: Tuple[Tuple[int, int], Tuple[int, int]],
) -> str:
spread_len = format[1][0] + 1 + format[1][1]
spread = x['spread']
if spread is not None:
spread_str = f' ± {spread:{spread_len}.{format[1][1]}f}s'
else:
spread_str = ' ' * (3 + spread_len + 1)
mean_len = format[0][0] + 1 + format[0][1]
return f'{x["center"]:{mean_len}.{format[0][1]}f}s{spread_str}'
def list_stat(l: List[float]) -> Stat:
return {
'center': statistics.mean(l),
'spread': statistics.stdev(l) if len(l) > 1 else None
}
def zero_stat() -> Stat:
return {'center': 0, 'spread': None}
def recenter(was: Stat, now: float) -> Stat:
return {'center': now - was['center'], 'spread': was['spread']}
def sum_normals(stats: Iterable[Stat]) -> Stat:
"""
Returns a stat corresponding to the sum of the given stats.
Assumes that the center and spread for each of the given stats are
mean and stdev, respectively.
"""
l = list(stats)
spread: Optional[float]
if any(stat['spread'] is not None for stat in l):
spread = math.sqrt(sum((stat['spread'] or 0)**2 for stat in l))
else:
spread = None
return {
'center': sum(stat['center'] for stat in l),
'spread': spread,
}
def format_seconds(seconds: List[float]) -> str:
if len(seconds) > 0:
x = list_stat(seconds)
return f'total time {display_stat(x, ((5, 2), (4, 2)))}'.strip()
return ''
def show_ancestors(num_commits: int) -> str:
return f' | : ({num_commits} commit{plural(num_commits)})'
def unlines(lines: List[str]) -> str:
return ''.join(f'{line}\n' for line in lines)
def matching_test_times(
base_reports: Dict[Commit, List[SimplerReport]],
suite_name: str,
case_name: str,
status: Status,
) -> List[float]:
times: List[float] = []
for reports in base_reports.values():
for report in reports:
suite = report.get(suite_name)
if suite:
case = suite.get(case_name)
if case:
t, s = case
if s == status:
times.append(t)
return times
def analyze(
*,
head_report: SimplerReport,
base_reports: Dict[Commit, List[SimplerReport]],
) -> List[SuiteDiff]:
# most recent master ancestor with at least one S3 report
base_sha = next(sha for sha, reports in base_reports.items() if reports)
# find all relevant suites (those in either base or head or both)
all_reports = [head_report] + base_reports[base_sha]
all_suites = {k for r in all_reports for k in r.keys()}
removed_suites: List[SuiteDiff] = []
modified_suites: List[SuiteDiff] = []
added_suites: List[SuiteDiff] = []
for suite_name in sorted(all_suites):
case_diffs: List[CaseDiff] = []
head_suite = head_report.get(suite_name)
base_cases: Dict[str, Status] = dict(sorted(set.intersection(*[
{(n, s) for n, (_, s) in report.get(suite_name, {}).items()}
for report in base_reports[base_sha]
] or [set()])))
case_stats: Dict[str, Stat] = {}
if head_suite:
now = sum(case[0] for case in head_suite.values())
if any(suite_name in report for report in base_reports[base_sha]):
removed_cases: List[CaseDiff] = []
for case_name, case_status in base_cases.items():
case_stats[case_name] = list_stat(matching_test_times(
base_reports,
suite_name,
case_name,
case_status,
))
if case_name not in head_suite:
removed_cases.append({
'margin': '-',
'name': case_name,
'was': (case_stats[case_name], case_status),
'now': None,
})
modified_cases: List[CaseDiff] = []
added_cases: List[CaseDiff] = []
for head_case_name in sorted(head_suite):
head_case = head_suite[head_case_name]
if head_case_name in base_cases:
stat = case_stats[head_case_name]
base_status = base_cases[head_case_name]
if head_case[1] != base_status:
modified_cases.append({
'margin': '!',
'name': head_case_name,
'was': (stat, base_status),
'now': head_case,
})
else:
added_cases.append({
'margin': '+',
'name': head_case_name,
'was': None,
'now': head_case,
})
# there might be a bug calculating this stdev, not sure
was = sum_normals(case_stats.values())
case_diffs = removed_cases + modified_cases + added_cases
if case_diffs:
modified_suites.append({
'margin': ' ',
'name': suite_name,
'was': was,
'now': now,
'cases': case_diffs,
})
else:
for head_case_name in sorted(head_suite):
head_case = head_suite[head_case_name]
case_diffs.append({
'margin': ' ',
'name': head_case_name,
'was': None,
'now': head_case,
})
added_suites.append({
'margin': '+',
'name': suite_name,
'was': None,
'now': now,
'cases': case_diffs,
})
else:
for case_name, case_status in base_cases.items():
case_stats[case_name] = list_stat(matching_test_times(
base_reports,
suite_name,
case_name,
case_status,
))
case_diffs.append({
'margin': ' ',
'name': case_name,
'was': (case_stats[case_name], case_status),
'now': None,
})
removed_suites.append({
'margin': '-',
'name': suite_name,
# there might be a bug calculating this stdev, not sure
'was': sum_normals(case_stats.values()),
'now': None,
'cases': case_diffs,
})
return removed_suites + modified_suites + added_suites
def case_diff_lines(diff: CaseDiff) -> List[str]:
lines = [f'def {diff["name"]}: ...']
case_fmt = ((3, 3), (2, 3))
was = diff['was']
if was:
was_line = f' # was {display_stat(was[0], case_fmt)}'
was_status = was[1]
if was_status:
was_line += f' ({was_status})'
lines.append(was_line)
now = diff['now']
if now:
now_stat: Stat = {'center': now[0], 'spread': None}
now_line = f' # now {display_stat(now_stat, case_fmt)}'
now_status = now[1]
if now_status:
now_line += f' ({now_status})'
lines.append(now_line)
return [''] + [f'{diff["margin"]} {l}' for l in lines]
def display_suite_diff(diff: SuiteDiff) -> str:
lines = [f'class {diff["name"]}:']
suite_fmt = ((4, 2), (3, 2))
was = diff['was']
if was:
lines.append(f' # was {display_stat(was, suite_fmt)}')
now = diff['now']
if now is not None:
now_stat: Stat = {'center': now, 'spread': None}
lines.append(f' # now {display_stat(now_stat, suite_fmt)}')
for case_diff in diff['cases']:
lines.extend([f' {l}' for l in case_diff_lines(case_diff)])
return unlines([''] + [f'{diff["margin"]} {l}'.rstrip() for l in lines] + [''])
def anomalies(diffs: List[SuiteDiff]) -> str:
return ''.join(map(display_suite_diff, diffs))
def graph(
*,
head_sha: Commit,
head_seconds: float,
base_seconds: Dict[Commit, List[float]],
on_master: bool,
ancestry_path: int = 0,
other_ancestors: int = 0,
) -> str:
lines = [
'Commit graph (base is most recent master ancestor with at least one S3 report):',
'',
' : (master)',
' |',
]
head_time_str = f' {format_seconds([head_seconds])}'
if on_master:
lines.append(f' * {head_sha[:10]} (HEAD) {head_time_str}')
else:
lines.append(f' | * {head_sha[:10]} (HEAD) {head_time_str}')
if ancestry_path > 0:
lines += [
' | |',
show_ancestors(ancestry_path),
]
if other_ancestors > 0:
lines += [
' |/|',
show_ancestors(other_ancestors),
' |',
]
else:
lines.append(' |/')
is_first = True
for sha, seconds in base_seconds.items():
num_runs = len(seconds)
prefix = str(num_runs).rjust(3)
base = '(base)' if is_first and num_runs > 0 else ' '
if num_runs > 0:
is_first = False
t = format_seconds(seconds)
p = plural(num_runs)
if t:
p = f'{p}, '.ljust(3)
lines.append(f' * {sha[:10]} {base} {prefix} report{p}{t}')
lines.extend([' |', ' :'])
return unlines(lines)
def case_delta(case: CaseDiff) -> Stat:
was = case['was']
now = case['now']
return recenter(
was[0] if was else zero_stat(),
now[0] if now else 0,
)
def display_final_stat(stat: Stat) -> str:
center = stat['center']
spread = stat['spread']
displayed = display_stat(
{'center': abs(center), 'spread': spread},
((4, 2), (3, 2)),
)
if center < 0:
sign = '-'
elif center > 0:
sign = '+'
else:
sign = ' '
return f'{sign}{displayed}'.rstrip()
def summary_line(message: str, d: DefaultDict[str, List[CaseDiff]]) -> str:
all_cases = [c for cs in d.values() for c in cs]
tests = len(all_cases)
suites = len(d)
sp = f'{plural(suites)})'.ljust(2)
tp = f'{plural(tests)},'.ljust(2)
# there might be a bug calculating this stdev, not sure
stat = sum_normals(case_delta(c) for c in all_cases)
return ''.join([
f'{message} (across {suites:>4} suite{sp}',
f'{tests:>6} test{tp}',
f' totaling {display_final_stat(stat)}',
])
def summary(analysis: List[SuiteDiff]) -> str:
removed_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list)
modified_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list)
added_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list)
for diff in analysis:
# the use of 'margin' here is not the most elegant
name = diff['name']
margin = diff['margin']
cases = diff['cases']
if margin == '-':
removed_tests[name] += cases
elif margin == '+':
added_tests[name] += cases
else:
removed = list(filter(lambda c: c['margin'] == '-', cases))
added = list(filter(lambda c: c['margin'] == '+', cases))
modified = list(filter(lambda c: c['margin'] == '!', cases))
if removed:
removed_tests[name] += removed
if added:
added_tests[name] += added
if modified:
modified_tests[name] += modified
return unlines([
summary_line('Removed ', removed_tests),
summary_line('Modified', modified_tests),
summary_line('Added ', added_tests),
])
def regression_info(
*,
head_sha: Commit,
head_report: Report,
base_reports: Dict[Commit, List[Report]],
job_name: str,
on_master: bool,
ancestry_path: int,
other_ancestors: int,
) -> str:
"""
Return a human-readable report describing any test time regressions.
The head_sha and head_report args give info about the current commit
and its test times. Since Python dicts maintain insertion order
(guaranteed as part of the language spec since 3.7), the
base_reports argument must list the head's several most recent
master commits, from newest to oldest (so the merge-base is
list(base_reports)[0]).
"""
simpler_head = simplify(head_report)
simpler_base: Dict[Commit, List[SimplerReport]] = {}
for commit, reports in base_reports.items():
simpler_base[commit] = [simplify(r) for r in reports]
analysis = analyze(
head_report=simpler_head,
base_reports=simpler_base,
)
return '\n'.join([
unlines([
'----- Historic stats comparison result ------',
'',
f' job: {job_name}',
f' commit: {head_sha}',
]),
anomalies(analysis),
graph(
head_sha=head_sha,
head_seconds=head_report['total_seconds'],
base_seconds={
c: [r['total_seconds'] for r in rs]
for c, rs in base_reports.items()
},
on_master=on_master,
ancestry_path=ancestry_path,
other_ancestors=other_ancestors,
),
summary(analysis),
])
class TestCase:
def __init__(self, dom) -> None:
self.class_name = str(dom.attributes['classname'].value)
self.name = str(dom.attributes['name'].value)
self.time = float(dom.attributes['time'].value)
self.errored = len(dom.getElementsByTagName('error')) > 0
self.failed = len(dom.getElementsByTagName('failure')) > 0
self.skipped = len(dom.getElementsByTagName('skipped')) > 0
class TestSuite:
def __init__(self, name) -> None:
self.name = name
self.test_cases: List[TestCase] = []
self.failed_count = 0
self.skipped_count = 0
self.errored_count = 0
self.total_time = 0.0
def __repr__(self):
rc = f'{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}'
if self.skipped_count > 0:
rc += f' skipped: {self.skipped_count}'
return f'TestSuite({rc})'
def append(self, test_case):
self.test_cases.append(test_case)
self.total_time += test_case.time
self.failed_count += 1 if test_case.failed else 0
self.skipped_count += 1 if test_case.skipped else 0
self.errored_count += 1 if test_case.errored else 0
def print_report(self, num_longest=3):
sorted_tests = sorted(self.test_cases, key=lambda x: x.time)
test_count = len(sorted_tests)
print(f"class {self.name}:")
print(f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} errored: {self.errored_count}")
print(f" run_time: {self.total_time:.2f} seconds")
print(f" avg_time: {self.total_time/test_count:.2f} seconds")
if test_count >= 2:
print(f" median_time: {statistics.median(x.time for x in sorted_tests):.2f} seconds")
sorted_tests = sorted_tests[-num_longest:]
print(f" {len(sorted_tests)} longest tests:")
for test in reversed(sorted_tests):
print(f" {test.name} time: {test.time:.2f} seconds")
print("")
def parse_report(path):
dom = minidom.parse(path)
for test_case in dom.getElementsByTagName('testcase'):
yield TestCase(test_case)
def parse_reports(folder):
reports = glob(os.path.join(folder, '**', '*.xml'), recursive=True)
tests_by_class = dict()
for report in reports:
for test_case in parse_report(report):
class_name = test_case.class_name
if class_name not in tests_by_class:
tests_by_class[class_name] = TestSuite(class_name)
tests_by_class[class_name].append(test_case)
return tests_by_class
def build_info():
return {
"build_pr": os.environ.get("CIRCLE_PR_NUMBER"),
"build_tag": os.environ.get("CIRCLE_TAG"),
"build_sha1": os.environ.get("CIRCLE_SHA1"),
"build_branch": os.environ.get("CIRCLE_BRANCH"),
"build_job": os.environ.get("CIRCLE_JOB"),
"build_workflow_id": os.environ.get("CIRCLE_WORKFLOW_ID"),
}
def build_message(test_case):
return {
"normal": {
**build_info(),
"test_suite_name": test_case.class_name,
"test_case_name": test_case.name,
},
"int": {
"time": int(time.time()),
"test_total_count": 1,
"test_total_time": int(test_case.time * 1000),
"test_failed_count": 1 if test_case.failed > 0 else 0,
"test_skipped_count": 1 if test_case.skipped > 0 else 0,
"test_errored_count": 1 if test_case.errored > 0 else 0,
},
}
def send_report_to_scribe(reports):
access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN")
if not access_token:
print("No scribe access token provided, skip sending report!")
return
print("Scribe access token provided, sending report...")
url = "https://graph.facebook.com/scribe_logs"
r = requests.post(
url,
data={
"access_token": access_token,
"logs": json.dumps(
[
{
"category": "perfpipe_pytorch_test_times",
"message": json.dumps(build_message(test_case)),
"line_escape": False,
}
for name in sorted(reports.keys())
for test_case in reports[name].test_cases
]
),
},
)
r.raise_for_status()
def assemble_s3_object(reports, *, total_seconds):
return {
**build_info(),
'total_seconds': total_seconds,
'suites': {
name: {
'total_seconds': suite.total_time,
'cases': [
{
'name': case.name,
'seconds': case.time,
'errored': case.errored,
'failed': case.failed,
'skipped': case.skipped,
}
for case in suite.test_cases
],
}
for name, suite in reports.items()
}
}
def send_report_to_s3(obj):
job = os.environ.get('CIRCLE_JOB')
sha1 = os.environ.get('CIRCLE_SHA1')
branch = os.environ.get('CIRCLE_BRANCH', '')
if branch not in ['master', 'nightly'] and not branch.startswith("release/"):
print("S3 upload only enabled on master, nightly and release branches.")
print(f"skipping test report on branch: {branch}")
return
now = datetime.datetime.utcnow().isoformat()
key = f'test_time/{sha1}/{job}/{now}Z.json.bz2' # Z meaning UTC
s3 = boto3.resource('s3')
try:
s3.get_bucket_acl(Bucket='ossci-metrics')
except Exception as e:
print(f"AWS ACL failed: {e}")
print("AWS credential found, uploading to S3...")
s3_obj = s3.Object('ossci-metrics', key)
print("")
# use bz2 because the results are smaller than gzip, and the
# compression time penalty we pay is only about half a second for
# input files of a few megabytes in size like these JSON files, and
# because for some reason zlib doesn't seem to play nice with the
# gunzip command whereas Python's bz2 does work with bzip2
s3_obj.put(Body=bz2.compress(json.dumps(obj).encode()))
def print_regressions(obj, *, num_prev_commits):
sha1 = os.environ.get("CIRCLE_SHA1", "HEAD")
base = subprocess.check_output(
["git", "merge-base", sha1, "origin/master"],
encoding="ascii",
).strip()
count_spec = f"{base}..{sha1}"
intermediate_commits = int(subprocess.check_output(
["git", "rev-list", "--count", count_spec],
encoding="ascii"
))
ancestry_path = int(subprocess.check_output(
["git", "rev-list", "--ancestry-path", "--count", count_spec],
encoding="ascii",
))
# if current commit is already on master, we need to exclude it from
# this history; otherwise we include the merge-base
commits = subprocess.check_output(
["git", "rev-list", f"--max-count={num_prev_commits+1}", base],
encoding="ascii",
).splitlines()
on_master = False
if base == sha1:
on_master = True
commits = commits[1:]
else:
commits = commits[:-1]
job = os.environ.get("CIRCLE_JOB", "")
s3 = boto3.resource("s3")
bucket = s3.Bucket(name="ossci-metrics")
index = {}
for commit in commits:
summaries = bucket.objects.filter(Prefix=f"test_time/{commit}/{job}/")
index[commit] = list(summaries)
objects: Dict[Commit, List[Report]] = {}
# should we do these in parallel?
for commit, summaries in index.items():
objects[commit] = []
for summary in summaries:
binary = summary.get()["Body"].read()
string = bz2.decompress(binary).decode("utf-8")
objects[commit].append(json.loads(string))
print()
print(regression_info(
head_sha=sha1,
head_report=obj,
base_reports=objects,
job_name=job,
on_master=on_master,
ancestry_path=ancestry_path - 1,
other_ancestors=intermediate_commits - ancestry_path,
), end="")
def positive_integer(value):
parsed = int(value)
if parsed < 1:
raise argparse.ArgumentTypeError(f"{value} is not a natural number")
return parsed
def positive_float(value):
parsed = float(value)
if parsed <= 0.0:
raise argparse.ArgumentTypeError(f"{value} is not a positive rational number")
return parsed
if __name__ == '__main__':
import argparse
import sys
parser = argparse.ArgumentParser(
"Print statistics from test XML output.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--longest-of-class",
type=positive_integer,
default=3,
metavar="N",
help="how many longest tests to show for each class",
)
parser.add_argument(
"--class-print-threshold",
type=positive_float,
default=1.0,
metavar="N",
help="Minimal total time to warrant class report",
)
parser.add_argument(
"--longest-of-run",
type=positive_integer,
default=10,
metavar="N",
help="how many longest tests to show from the entire run",
)
if HAVE_BOTO3:
parser.add_argument(
"--upload-to-s3",
action="store_true",
help="upload test time to S3 bucket",
)
parser.add_argument(
"--compare-with-s3",
action="store_true",
help="download test times for base commits and compare",
)
parser.add_argument(
"--num-prev-commits",
type=positive_integer,
default=10,
metavar="N",
help="how many previous commits to compare test times with",
)
parser.add_argument(
"--use-json",
metavar="FILE.json",
help="compare S3 with JSON file, instead of the test report folder",
)
parser.add_argument(
"folder",
help="test report folder",
)
args = parser.parse_args()
reports = parse_reports(args.folder)
if len(reports) == 0:
print(f"No test reports found in {args.folder}")
sys.exit(0)
send_report_to_scribe(reports)
longest_tests = []
total_time = 0
for name in sorted(reports.keys()):
test_suite = reports[name]
if test_suite.total_time >= args.class_print_threshold:
test_suite.print_report(args.longest_of_class)
total_time += test_suite.total_time
longest_tests.extend(test_suite.test_cases)
longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:]
obj = assemble_s3_object(reports, total_seconds=total_time)
if args.upload_to_s3:
send_report_to_s3(obj)
print(f"Total runtime is {datetime.timedelta(seconds=int(total_time))}")
print(f"{len(longest_tests)} longest tests of entire run:")
for test_case in reversed(longest_tests):
print(f" {test_case.class_name}.{test_case.name} time: {test_case.time:.2f} seconds")
if args.compare_with_s3:
head_json = obj
if args.use_json:
head_json = json.loads(Path(args.use_json).read_text())
print_regressions(head_json, num_prev_commits=args.num_prev_commits)