Add model name, quantization and device to gpt_fast micro benchmark output (#128091)

A small enhancement to https://hud.pytorch.org/benchmark/llms that adds these columns to the benchmark output.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/128091
Approved by: https://github.com/yanboliang
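
For reference, here is a minimal sketch of the updated Experiment record after this change. The name field and the field order are inferred from the constructor calls in the diff below; the comments are illustrative and not part of the PR.

from dataclasses import dataclass

@dataclass
class Experiment:
    name: str      # model or benchmark name, e.g. model.name (inferred field)
    metric: str    # e.g. "token_per_sec" or "memory_bandwidth(GB/s)"
    target: float
    actual: float
    dtype: str     # new column: quantization/dtype, e.g. "bfloat16" or "int8"
    device: str    # new column: e.g. "cuda"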
Author: Huy Do, 2024-06-15 01:39:48 +00:00; committed by PyTorch MergeBot
commit f37121bb74 (parent 3f47c72268)
2 changed files with 43 additions and 21 deletions


@@ -21,6 +21,8 @@ class Experiment:
     metric: str
     target: float
     actual: float
+    dtype: str
+    device: str
 class SimpleMLP(nn.Module):
@@ -41,7 +43,7 @@ class SimpleMLP(nn.Module):
         return x
-def run_mlp_layer_norm_gelu():
+def run_mlp_layer_norm_gelu(device: str = "cuda"):
     dtype_flops_utilization_map = {
         torch.bfloat16: "0.71",
     }
@@ -53,9 +55,9 @@ def run_mlp_layer_norm_gelu():
         for D in input_shapes:
             mod = SimpleMLP(
                 input_dim=D, hidden_dim=intermediate_size, output_dim=D, dtype=dtype
-            ).to("cuda")
+            ).to(device)
-            x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
+            x = torch.randn(D, device=device, dtype=torch.bfloat16)
             with FlopCounterMode(display=False) as mode:
                 mod(x)
@@ -78,12 +80,14 @@ def run_mlp_layer_norm_gelu():
                 "flops_utilization",
                 expected_flops_utilization,
                 f"{flops_utilization:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
-def run_layer_norm():
+def run_layer_norm(device: str = "cuda"):
     dtype_memory_bandwidth_map = {
         torch.bfloat16: "1017",
     }
@@ -93,9 +97,9 @@ def run_layer_norm():
     for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
         memory_bandwidth = 0
         for D in input_shapes:
-            mod = nn.LayerNorm(D).to("cuda")
+            mod = nn.LayerNorm(D).to(device)
-            x = torch.randn(BS, D, device="cuda", dtype=dtype)
+            x = torch.randn(BS, D, device=device, dtype=dtype)
             compiled_mod = torch.compile(mod, dynamic=False)
@@ -113,13 +117,15 @@ def run_layer_norm():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
 @torch._inductor.config.patch(coordinate_descent_tuning=True)
-def run_gather_gemv():
+def run_gather_gemv(device: str = "cuda"):
     E = 8
     dtype_memory_bandwidth_map = {
         torch.int8: "1113",
@@ -134,9 +140,9 @@ def run_gather_gemv():
             def gather_gemv(W, score_idxs, x):
                 return W[score_idxs].to(x.dtype) @ x
-            W = torch.randn(E, D, D, device="cuda").to(dtype=dtype)
-            x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
-            score_idxs = torch.tensor([3, 5], device="cuda")
+            W = torch.randn(E, D, D, device=device).to(dtype=dtype)
+            x = torch.randn(D, device=device, dtype=torch.bfloat16)
+            score_idxs = torch.tensor([3, 5], device=device)
             compiled_fn = torch.compile(gather_gemv, dynamic=False)
@@ -154,13 +160,15 @@ def run_gather_gemv():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
 @torch._inductor.config.patch(coordinate_descent_tuning=True)
-def run_gemv():
+def run_gemv(device: str = "cuda"):
     dtype_memory_bandwidth_map = {
         torch.int8: "990",
         torch.bfloat16: "1137",
@@ -193,6 +201,8 @@ def run_gemv():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results


@@ -172,8 +172,8 @@ def run_experiment(
     max_new_tokens: int = 200,
     top_k: int = 200,
     temperature: float = 0.8,
+    device: str = "cuda",
 ) -> None:
-    device = "cuda"
     print(f"Loading model {x.name}")
     t0 = time.time()
     model = _load_model(x)
@@ -221,7 +221,7 @@ def run_experiment(
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_llama2_7b_bf16():
+def run_llama2_7b_bf16(device: str = "cuda"):
     from benchmark import Experiment
     model = GPTModelConfig(
@@ -235,22 +235,26 @@ def run_llama2_7b_bf16():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "llama2_7b_bf16",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "llama2_7b_bf16",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_llama2_7b_int8():
+def run_llama2_7b_int8(device: str = "cuda"):
     from benchmark import Experiment
     model = GPTModelConfig(
@@ -264,22 +268,26 @@ def run_llama2_7b_int8():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "llama2_7b_int8",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "llama2_7b_int8",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_mixtral_8x7b_int8():
+def run_mixtral_8x7b_int8(device: str = "cuda"):
     from benchmark import Experiment
     # We reduced the original number of layers from 32 to 16 to adapt CI memory limitation.
@@ -294,15 +302,19 @@ def run_mixtral_8x7b_int8():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "mixtral_8x7b_int8",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "mixtral_8x7b_int8",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
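
As a usage illustration (a hypothetical sketch, not part of this diff), the benchmark entry points can now be called with an explicit device, and every Experiment row carries the new dtype and device columns, using the functions and field names shown above:

# assumes the functions above are importable from the gpt_fast benchmark scripts
results = []
results.extend(run_mlp_layer_norm_gelu(device="cuda"))
results.extend(run_llama2_7b_bf16(device="cuda"))
for r in results:
    # Each row now reports the quantization/dtype and the device it ran on.
    print(r.name, r.metric, r.target, r.actual, r.dtype, r.device)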