Add mistral/gpt-oss to benchmarks (#163565)

Potential issues:
* gpt-oss-20b is probably too big (I can't run it on my devserver)
* Mistral requires HF authentication
* Mistral also takes a while to run the performance checks (need to wait for CI)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163565
Approved by: https://github.com/huydhn
2025-12-06 00:20:18 +01:00 · 2025-09-24 06:12:36 +00:00 · 2025-09-24 06:12:36 +00:00 · dad54ca7c0
commit dad54ca7c0
parent 2c5a3d7e60
15 changed files with 94 additions and 0 deletions
--- a/benchmarks/dynamo/check_accuracy.py
+++ b/benchmarks/dynamo/check_accuracy.py
@ -78,6 +78,8 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "google/gemma-3-4b-it",
                "openai/whisper-tiny",
                "Qwen/Qwen3-0.6B",
                "mistralai/Mistral-7B-Instruct-v0.3",
                "openai/gpt-oss-20b",
            }
        )
--- a/benchmarks/dynamo/check_graph_breaks.py
+++ b/benchmarks/dynamo/check_graph_breaks.py
@ -61,6 +61,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
                "google/gemma-3-4b-it",
                "openai/whisper-tiny",
                "Qwen/Qwen3-0.6B",
                "mistralai/Mistral-7B-Instruct-v0.3",
                "openai/gpt-oss-20b",
            }
        )
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 Qwen/Qwen3-0.6B,pass,0
 mistralai/Mistral-7B-Instruct-v0.3,pass,0
 openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
@ -187,3 +187,11 @@ openai/whisper-tiny,fail_to_run,0
 Qwen/Qwen3-0.6B,fail_to_run,0
 mistralai/Mistral-7B-Instruct-v0.3,fail_to_run,0
 openai/gpt-oss-20b,fail_to_run,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
 Qwen/Qwen3-0.6B,pass_due_to_skip,0
 mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
 openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
 Qwen/Qwen3-0.6B,pass_due_to_skip,0
 mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
 openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0
 Qwen/Qwen3-0.6B,pass_due_to_skip,0
 mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
 openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 Qwen/Qwen3-0.6B,pass,0
 mistralai/Mistral-7B-Instruct-v0.3,pass,0
 openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 Qwen/Qwen3-0.6B,pass,0
 mistralai/Mistral-7B-Instruct-v0.3,pass,0
 openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 Qwen/Qwen3-0.6B,pass,0
 mistralai/Mistral-7B-Instruct-v0.3,pass,0
 openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 Qwen/Qwen3-0.6B,pass,0
 mistralai/Mistral-7B-Instruct-v0.3,pass,0
 openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0
 Qwen/Qwen3-0.6B,pass,0
 mistralai/Mistral-7B-Instruct-v0.3,pass,0
 openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/huggingface.yaml
+++ b/benchmarks/dynamo/huggingface.yaml
@ -11,6 +11,8 @@ skip:
    - GPTJForQuestionAnswering
    # Model too big
    - google/gemma-3-4b-it
    - openai/gpt-oss-20b
    - mistralai/Mistral-7B-Instruct-v0.3
  device:
    cpu:
@ -19,6 +21,8 @@ skip:
      - google/gemma-3-4b-it
      - openai/whisper-tiny
      - Qwen/Qwen3-0.6B
      - mistralai/Mistral-7B-Instruct-v0.3
      - openai/gpt-oss-20b
  control_flow:
    - AllenaiLongformerBase
@ -79,6 +83,8 @@ batch_size:
    google/gemma-3-4b-it: 8
    openai/whisper-tiny: 8
    Qwen/Qwen3-0.6B: 8
    mistralai/Mistral-7B-Instruct-v0.3: 8
    openai/gpt-oss-20b: 8
 tolerance:
--- a/benchmarks/dynamo/huggingface_llm_models.py
+++ b/benchmarks/dynamo/huggingface_llm_models.py
@ -99,4 +99,6 @@ HF_LLM_MODELS: dict[str, Benchmark] = {
    "google/gemma-3-4b-it": TextGenerationBenchmark,
    "openai/whisper-tiny": WhisperBenchmark,
    "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
    "mistralai/Mistral-7B-Instruct-v0.3": TextGenerationBenchmark,
    "openai/gpt-oss-20b": TextGenerationBenchmark,
 }
--- a/benchmarks/dynamo/huggingface_models_list.txt
+++ b/benchmarks/dynamo/huggingface_models_list.txt
@ -51,3 +51,5 @@ google/gemma-2-2b,8
 google/gemma-3-4b-it,8
 openai/whisper-tiny,8
 Qwen/Qwen3-0.6B,8
 mistralai/Mistral-7B-Instruct-v0.3, 8
 openai/gpt-oss-20b, 8