pytorch/benchmarks/dynamo/pr_time_benchmarks/check_results.py
Laith Sakka c4fb0b6f33 refresh expected results (#150166)
@huydhn when do you think we will have the APIs to access results on OSS storage available, so we do not
have to worry about this racing again?
Also, is there a way to accelerate handling instability in this after we land it?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150166
Approved by: https://github.com/bobrenjc93, https://github.com/eellison, https://github.com/anijain2305
2025-05-13 04:04:42 +00:00


import copy
import csv
import json
import sys
from dataclasses import dataclass

import torch._logging.scribe as scribe


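# One row of the expected-results CSV: benchmark name, metric name, expected value,
# and the allowed noise margin expressed as a fraction (0.01 == 1%).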
@dataclass
class ExpectedFileEntry:
    benchmark_name: str
    metric_name: str
    expected_value: int
    noise_margin: float


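# One row of the result CSV produced by the current run: benchmark name,
# metric name, and the value measured in this run.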
@dataclass
class ResultFileEntry:
    benchmark_name: str
    metric_name: str
    actual_value: int


def replace_with_zeros(num):
    """
    Keeps the first four digits of an integer and replaces the rest with zeros.

    Args:
        num (int): The number to modify.

    Returns:
        int: The modified number.

    Raises:
        ValueError: If the input is not an integer.
    """
    # Check if input is an integer
    if not isinstance(num, int):
        raise ValueError("Input must be an integer")

    # Calculate the number of digits to remove
    digits_to_remove = len(str(abs(num))) - 4

    # Replace digits with zeros
    if digits_to_remove > 0:
        modified_num = (num // 10**digits_to_remove) * 10**digits_to_remove
    else:
        modified_num = num

    return modified_num


def main():
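    # Usage:
    #   python check_results.py <expected_results_csv> <current_results_csv> <reference_expected_results_csv>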
    # Expected file is the file that has the results that we are comparing against.
    # Expected has the following format:
    # benchmark_name, metric name, expected value, noise margin (as a fraction, e.g. 0.01 == 1%)
    # Example:
    # add_loop_eager,compile_time_instruction_count,283178305, 0.01 (1% noise margin)
    expected_file_path = sys.argv[1]

    # Result file is the file that has the results of the current run. It has the following format:
    # benchmark_name, metric name, actual value
    # Example:
    # add_loop_eager,compile_time_instruction_count,283178305
    result_file_path = sys.argv[2]

    # A path where a new expected results file will be written; it can be used to replace expected_results.csv
    # in case of failure. If there is no failure, the content of this file will match expected_file_path.
    reference_expected_results_path = sys.argv[3]

    # Read expected data file.
    expected_data: dict[tuple[str, str], ExpectedFileEntry] = {}
    with open(expected_file_path) as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) == 0:
                continue
            entry = ExpectedFileEntry(
                benchmark_name=row[0].strip(),
                metric_name=row[1].strip(),
                expected_value=int(row[2]),
                noise_margin=float(row[3]),
            )
            key = (entry.benchmark_name, entry.metric_name)
            assert key not in expected_data, f"Duplicate entry for {key}"
            expected_data[key] = entry

    # Read result data file.
    result_data: dict[tuple[str, str], ResultFileEntry] = {}
    with open(result_file_path) as f:
        reader = csv.reader(f)
        for row in reader:
            entry = ResultFileEntry(
                benchmark_name=row[0].strip(),
                metric_name=row[1].strip(),
                actual_value=int(row[2]),
            )
            key = (entry.benchmark_name, entry.metric_name)
            assert key not in result_data, f"Duplicate entry for {key}"
            result_data[key] = entry

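    # Track whether any benchmark failed, and build a refreshed copy of the expected
    # data; entries are updated below whenever a benchmark moves by more than a third
    # of its noise margin, so the regenerated file can replace expected_results.csv.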
    fail = False
    new_expected = copy.deepcopy(expected_data)
    for key, entry in expected_data.items():
        if key not in result_data:
            print(f"Missing entry for {key} in result file")
            sys.exit(1)

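        # Allowed band around the expected value, and the percentage change of this
        # run relative to the expected value.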
        low = entry.expected_value - entry.expected_value * entry.noise_margin
        high = entry.expected_value + entry.expected_value * entry.noise_margin
        result = result_data[key].actual_value

        ratio = float(result - entry.expected_value) * 100 / entry.expected_value

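        # Emit a scribe open_source_signpost event recording how this benchmark
        # compared against its expected value.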
        def log(event_name):
            scribe.open_source_signpost(
                subsystem="pr_time_benchmarks",
                name=event_name,
                parameters=json.dumps(
                    {
                        "benchmark_name": entry.benchmark_name,
                        "metric_name": entry.metric_name,
                        "actual_value": result,
                        "expected_value": entry.expected_value,
                        "noise_margin": entry.noise_margin,
                        "change_ratio": ratio,
                    }
                ),
            )

        new_entry = copy.deepcopy(entry)
        # Only refresh the expected value if the change exceeds a third of the noise margin;
        # otherwise keep the current expected value to avoid churn.
        new_entry.expected_value = (
            replace_with_zeros(result)
            if abs(ratio) > entry.noise_margin * 100 / 3
            else entry.expected_value
        )
        new_expected[key] = new_entry

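        # Fail the check if the measured value falls outside the noise band in
        # either direction (a regression or an unexpected win).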
        if result > high:
            fail = True
            print(
                f"REGRESSION: benchmark {key} failed, actual result {result} "
                f"is {ratio:.2f}% higher than expected {entry.expected_value} ±{entry.noise_margin * 100:.2f}%. "
                f"If this is an expected regression, please update the expected results.\n"
            )
            print(
                "Please update all results that changed significantly, not only the failed ones."
            )
            log("fail_regression")
        elif result < low:
            fail = True
            print(
                f"WIN: benchmark {key} failed, actual result {result} is {ratio:+.2f}% lower than "
                f"expected {entry.expected_value} ±{entry.noise_margin * 100:.2f}%. "
                f"Please OPEN THE TEST RESULTS and update ALL benchmark results with the newly printed expected results.\n"
            )
            print(
                "Please update all results that changed significantly, not only the failed ones."
            )
            log("fail_win")
        else:
            print(
                f"PASS: benchmark {key} passed, actual result {result} ({ratio:+.2f}%) is within "
                f"expected {entry.expected_value} ±{entry.noise_margin * 100:.2f}%\n"
            )
            log("pass")

    # Log all benchmarks that do not have a regression test enabled for them.
    for key, entry in result_data.items():
        if key not in expected_data:
            print(
                f"MISSING REGRESSION TEST: benchmark {key} does not have a regression test enabled for it.\n"
            )
            scribe.open_source_signpost(
                subsystem="pr_time_benchmarks",
                name="missing_regression_test",
                parameters=json.dumps(
                    {
                        "benchmark_name": entry.benchmark_name,
                        "metric_name": entry.metric_name,
                    }
                ),
            )

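    # Write the (possibly refreshed) expected results to the reference file so they
    # can replace expected_results.csv in case of failure.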
    with open(reference_expected_results_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for entry in new_expected.values():
            # Write the data to the CSV file
            # print(f"{entry.benchmark_name},{entry.metric_name,},{round(entry.expected_value)},{entry.noise_margin}")
            writer.writerow(
                [
                    entry.benchmark_name,
                    entry.metric_name,
                    entry.expected_value,
                    entry.noise_margin,
                ]
            )

        # Three empty rows at the end to reduce merge conflicts.
        writer.writerow([])
        writer.writerow([])
        writer.writerow([])

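    # Print the regenerated expected-results file so it can be copy-pasted over
    # expected_results.csv directly from the CI logs.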
print("=" * 80)
print("=" * 80)
print("=" * 80)
print("To update expected results, run the following command:")
print()
print("cat > benchmarks/dynamo/pr_time_benchmarks/expected_results.csv << EOF")
with open(reference_expected_results_path) as f:
print(f.read().rstrip())
print("EOF")
print()
print("=" * 80)
print("=" * 80)
print("=" * 80)
    if fail:
        print(
            f"There were some failures. You can use the new reference expected results stored at "
            f"{reference_expected_results_path} and printed above.\n"
        )
        print(
            "To reproduce locally, follow these instructions. Note that absolute instruction counts are going "
            "to differ from those on CI, so you might want to run locally both with and without your change:\n"
            "cd benchmarks/dynamo/pr_time_benchmarks/ \n"
            "python benchmarks/BENCHMARK.py result.csv \n"
            "Note that BENCHMARK.py is the name of the file containing the failing benchmark."
        )
        sys.exit(1)
    else:
        print("All benchmarks passed")


if __name__ == "__main__":
    main()