Add model name, quantization and device to gpt_fast micro benchmark output (#128091)

A small enhancement to https://hud.pytorch.org/benchmark/llms that adds these columns to the benchmark output.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/128091
Approved by: https://github.com/yanboliang
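
For reference, here is a minimal sketch of the updated Experiment record after this change. The name field and the field order are inferred from the constructor calls in the diff below; the comments are illustrative and not part of the PR.

from dataclasses import dataclass

@dataclass
class Experiment:
    name: str      # model or benchmark name, e.g. model.name (inferred field)
    metric: str    # e.g. "token_per_sec" or "memory_bandwidth(GB/s)"
    target: float
    actual: float
    dtype: str     # new column: quantization/dtype, e.g. "bfloat16" or "int8"
    device: str    # new column: e.g. "cuda"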
Author: Huy Do, 2024-06-15 01:39:48 +00:00; committed by PyTorch MergeBot
commit f37121bb74 (parent 3f47c72268)
2 changed files with 43 additions and 21 deletions


@@ -21,6 +21,8 @@ class Experiment:
     metric: str
     target: float
     actual: float
+    dtype: str
+    device: str
 class SimpleMLP(nn.Module):
@@ -41,7 +43,7 @@ class SimpleMLP(nn.Module):
         return x
-def run_mlp_layer_norm_gelu():
+def run_mlp_layer_norm_gelu(device: str = "cuda"):
     dtype_flops_utilization_map = {
         torch.bfloat16: "0.71",
     }
@@ -53,9 +55,9 @@ def run_mlp_layer_norm_gelu():
         for D in input_shapes:
             mod = SimpleMLP(
                 input_dim=D, hidden_dim=intermediate_size, output_dim=D, dtype=dtype
-            ).to("cuda")
+            ).to(device)
-            x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
+            x = torch.randn(D, device=device, dtype=torch.bfloat16)
             with FlopCounterMode(display=False) as mode:
                 mod(x)
@@ -78,12 +80,14 @@ def run_mlp_layer_norm_gelu():
                 "flops_utilization",
                 expected_flops_utilization,
                 f"{flops_utilization:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
-def run_layer_norm():
+def run_layer_norm(device: str = "cuda"):
     dtype_memory_bandwidth_map = {
         torch.bfloat16: "1017",
     }
@@ -93,9 +97,9 @@ def run_layer_norm():
     for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
         memory_bandwidth = 0
         for D in input_shapes:
-            mod = nn.LayerNorm(D).to("cuda")
+            mod = nn.LayerNorm(D).to(device)
-            x = torch.randn(BS, D, device="cuda", dtype=dtype)
+            x = torch.randn(BS, D, device=device, dtype=dtype)
             compiled_mod = torch.compile(mod, dynamic=False)
@@ -113,13 +117,15 @@ def run_layer_norm():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
 @torch._inductor.config.patch(coordinate_descent_tuning=True)
-def run_gather_gemv():
+def run_gather_gemv(device: str = "cuda"):
     E = 8
     dtype_memory_bandwidth_map = {
         torch.int8: "1113",
@@ -134,9 +140,9 @@ def run_gather_gemv():
             def gather_gemv(W, score_idxs, x):
                 return W[score_idxs].to(x.dtype) @ x
-            W = torch.randn(E, D, D, device="cuda").to(dtype=dtype)
-            x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
-            score_idxs = torch.tensor([3, 5], device="cuda")
+            W = torch.randn(E, D, D, device=device).to(dtype=dtype)
+            x = torch.randn(D, device=device, dtype=torch.bfloat16)
+            score_idxs = torch.tensor([3, 5], device=device)
             compiled_fn = torch.compile(gather_gemv, dynamic=False)
@@ -154,13 +160,15 @@ def run_gather_gemv():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
 @torch._inductor.config.patch(coordinate_descent_tuning=True)
-def run_gemv():
+def run_gemv(device: str = "cuda"):
     dtype_memory_bandwidth_map = {
         torch.int8: "990",
         torch.bfloat16: "1137",
@@ -193,6 +201,8 @@ def run_gemv():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results


@@ -172,8 +172,8 @@ def run_experiment(
     max_new_tokens: int = 200,
     top_k: int = 200,
     temperature: float = 0.8,
+    device: str = "cuda",
 ) -> None:
-    device = "cuda"
     print(f"Loading model {x.name}")
     t0 = time.time()
     model = _load_model(x)
@@ -221,7 +221,7 @@ def run_experiment(
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_llama2_7b_bf16():
+def run_llama2_7b_bf16(device: str = "cuda"):
     from benchmark import Experiment
     model = GPTModelConfig(
@@ -235,22 +235,26 @@ def run_llama2_7b_bf16():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "llama2_7b_bf16",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "llama2_7b_bf16",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_llama2_7b_int8():
+def run_llama2_7b_int8(device: str = "cuda"):
     from benchmark import Experiment
     model = GPTModelConfig(
@@ -264,22 +268,26 @@ def run_llama2_7b_int8():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "llama2_7b_int8",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "llama2_7b_int8",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_mixtral_8x7b_int8():
+def run_mixtral_8x7b_int8(device: str = "cuda"):
     from benchmark import Experiment
     # We reduced the original number of layers from 32 to 16 to adapt CI memory limitation.
@@ -294,15 +302,19 @@ def run_mixtral_8x7b_int8():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "mixtral_8x7b_int8",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "mixtral_8x7b_int8",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
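
As a usage illustration (a hypothetical sketch, not part of this diff), the benchmark entry points can now be called with an explicit device, and every Experiment row carries the new dtype and device columns, using the functions and field names shown above:

# assumes the functions above are importable from the gpt_fast benchmark scripts
results = []
results.extend(run_mlp_layer_norm_gelu(device="cuda"))
results.extend(run_llama2_7b_bf16(device="cuda"))
for r in results:
    # Each row now reports the quantization/dtype and the device it ran on.
    print(r.name, r.metric, r.target, r.actual, r.dtype, r.device)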