Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
Add model name, quantization and device to gpt_fast micro benchmark output (#128091)
A small enhancement to https://hud.pytorch.org/benchmark/llms that adds these columns to the benchmark output.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/128091
Approved by: https://github.com/yanboliang
This commit is contained in:
parent 3f47c72268
commit f37121bb74
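
For context, the Experiment record these benchmarks emit ends up with the following shape after this change. This is a minimal sketch reconstructed from the diff below; the label of the first field is an assumption, since only the later fields appear as context lines:

    from dataclasses import dataclass

    @dataclass
    class Experiment:
        name: str     # model/benchmark name column (field name assumed)
        metric: str
        target: float
        actual: float
        dtype: str    # added: quantization/dtype column
        device: str   # added: device column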
@@ -21,6 +21,8 @@ class Experiment:
     metric: str
     target: float
     actual: float
+    dtype: str
+    device: str
 
 
 class SimpleMLP(nn.Module):
@@ -41,7 +43,7 @@ class SimpleMLP(nn.Module):
         return x
 
 
-def run_mlp_layer_norm_gelu():
+def run_mlp_layer_norm_gelu(device: str = "cuda"):
     dtype_flops_utilization_map = {
         torch.bfloat16: "0.71",
     }
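
Every micro benchmark gains the same optional keyword, so callers keep the old behavior by default or can redirect the run; a minimal usage sketch, assuming the surrounding module is imported and noting that the hardcoded target numbers still refer to CUDA hardware:

    results = run_mlp_layer_norm_gelu()           # unchanged behavior, device="cuda"
    results_cpu = run_mlp_layer_norm_gelu("cpu")  # assumed to work where the ops are supported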
@@ -53,9 +55,9 @@ def run_mlp_layer_norm_gelu():
     for D in input_shapes:
         mod = SimpleMLP(
             input_dim=D, hidden_dim=intermediate_size, output_dim=D, dtype=dtype
-        ).to("cuda")
+        ).to(device)
 
-        x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
+        x = torch.randn(D, device=device, dtype=torch.bfloat16)
 
         with FlopCounterMode(display=False) as mode:
             mod(x)
@@ -78,12 +80,14 @@ def run_mlp_layer_norm_gelu():
                 "flops_utilization",
                 expected_flops_utilization,
                 f"{flops_utilization:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
 
 
-def run_layer_norm():
+def run_layer_norm(device: str = "cuda"):
     dtype_memory_bandwidth_map = {
         torch.bfloat16: "1017",
     }
@@ -93,9 +97,9 @@ def run_layer_norm():
     for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
         memory_bandwidth = 0
         for D in input_shapes:
-            mod = nn.LayerNorm(D).to("cuda")
+            mod = nn.LayerNorm(D).to(device)
 
-            x = torch.randn(BS, D, device="cuda", dtype=dtype)
+            x = torch.randn(BS, D, device=device, dtype=dtype)
 
             compiled_mod = torch.compile(mod, dynamic=False)
 
@@ -113,13 +117,15 @@ def run_layer_norm():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
 
 
 @torch._inductor.config.patch(coordinate_descent_tuning=True)
-def run_gather_gemv():
+def run_gather_gemv(device: str = "cuda"):
     E = 8
     dtype_memory_bandwidth_map = {
         torch.int8: "1113",
@@ -134,9 +140,9 @@ def run_gather_gemv():
     def gather_gemv(W, score_idxs, x):
         return W[score_idxs].to(x.dtype) @ x
 
-    W = torch.randn(E, D, D, device="cuda").to(dtype=dtype)
-    x = torch.randn(D, device="cuda", dtype=torch.bfloat16)
-    score_idxs = torch.tensor([3, 5], device="cuda")
+    W = torch.randn(E, D, D, device=device).to(dtype=dtype)
+    x = torch.randn(D, device=device, dtype=torch.bfloat16)
+    score_idxs = torch.tensor([3, 5], device=device)
 
     compiled_fn = torch.compile(gather_gemv, dynamic=False)
 
@@ -154,13 +160,15 @@ def run_gather_gemv():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
 
 
 @torch._inductor.config.patch(coordinate_descent_tuning=True)
-def run_gemv():
+def run_gemv(device: str = "cuda"):
     dtype_memory_bandwidth_map = {
         torch.int8: "990",
         torch.bfloat16: "1137",
@@ -193,6 +201,8 @@ def run_gemv():
                 "memory_bandwidth(GB/s)",
                 expected_memory_bandwidth,
                 f"{memory_bandwidth:.02f}",
+                dtype_str,
+                device,
             )
         )
     return results
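
All four micro benchmarks follow the same pattern: the two new fields are appended to every result row. A condensed sketch of that pattern follows; dtype_str is computed outside the hunks shown, so its derivation here is an assumption:

    dtype_str = str(dtype).replace("torch.", "")  # assumed, e.g. "bfloat16" or "int8"
    results.append(
        Experiment(
            "gemv",                      # benchmark name (context line not shown in the diff)
            "memory_bandwidth(GB/s)",
            expected_memory_bandwidth,
            f"{memory_bandwidth:.02f}",
            dtype_str,                   # new quantization/dtype column
            device,                      # new device column
        )
    )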
@@ -172,8 +172,8 @@ def run_experiment(
     max_new_tokens: int = 200,
     top_k: int = 200,
     temperature: float = 0.8,
+    device: str = "cuda",
 ) -> None:
-    device = "cuda"
     print(f"Loading model {x.name}")
     t0 = time.time()
     model = _load_model(x)
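
run_experiment no longer pins device = "cuda" inside its body; the device is part of the signature instead. The end-to-end runners below still call run_experiment(model) and rely on the default, but an explicit override is now possible; a hedged sketch:

    token_per_sec, memory_bandwidth = run_experiment(model)  # default device="cuda"
    # token_per_sec, memory_bandwidth = run_experiment(model, device="cpu")  # assumed override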
@@ -221,7 +221,7 @@ def run_experiment(
 
 
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_llama2_7b_bf16():
+def run_llama2_7b_bf16(device: str = "cuda"):
     from benchmark import Experiment
 
     model = GPTModelConfig(
@@ -235,22 +235,26 @@ def run_llama2_7b_bf16():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "llama2_7b_bf16",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "llama2_7b_bf16",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
 
 
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_llama2_7b_int8():
+def run_llama2_7b_int8(device: str = "cuda"):
     from benchmark import Experiment
 
     model = GPTModelConfig(
@@ -264,22 +268,26 @@ def run_llama2_7b_int8():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "llama2_7b_int8",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "llama2_7b_int8",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
         ),
     ]
 
 
 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
-def run_mixtral_8x7b_int8():
+def run_mixtral_8x7b_int8(device: str = "cuda"):
     from benchmark import Experiment
 
     # We reduced the original number of layers from 32 to 16 to adapt CI memory limitation.
@@ -294,15 +302,19 @@ def run_mixtral_8x7b_int8():
     token_per_sec, memory_bandwidth = run_experiment(model)
     return [
         Experiment(
-            "mixtral_8x7b_int8",
+            model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
+            model.mode,
+            device,
         ),
         Experiment(
-            "mixtral_8x7b_int8",
+            model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
+            model.mode,
+            device,
        ),
     ]
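
A minimal sketch of how the three end-to-end runners could be combined into one result table at a chosen device; the driver below is an illustration assumed for this page, not part of the commit:

    def collect_all(device: str = "cuda"):
        rows = []
        rows += run_llama2_7b_bf16(device)
        rows += run_llama2_7b_int8(device)
        rows += run_mixtral_8x7b_int8(device)
        return rows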