[cutlass backend] Add dynamo_timed instrumentation (#157410)

Differential Revision: [D77631592](https://our.internmc.facebook.com/intern/diff/D77631592/)

Before:
![Screenshot 2025-07-01 at 4 08 06 PM](https://github.com/user-attachments/assets/8f6445aa-50c7-456f-b5ac-b2749eb9bf40)

After (different run):
![Screenshot 2025-07-01 at 5 11 09 PM](https://github.com/user-attachments/assets/7513d312-c4dc-4e39-9718-c63eb641bc30)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157410
Approved by: https://github.com/jingsh
This commit is contained in:
henrylhtsang 2025-07-01 17:13:59 -07:00 committed by PyTorch MergeBot
parent 493f42a541
commit b642a5c118
2 changed files with 17 additions and 10 deletions

View File

@ -13,6 +13,7 @@ from typing import Any, Optional
import sympy
import torch
from torch._inductor.runtime.runtime_utils import dynamo_timed
from torch._inductor.utils import clear_on_fresh_cache
from ... import config
@ -278,9 +279,10 @@ def gen_ops() -> dict[Any, Any]:
"""
Generates all supported CUTLASS operations.
"""
arch = get_cuda_arch()
version = get_cuda_version()
return _gen_ops_cached(arch, version)
with dynamo_timed("cutlass_utils.gen_ops"):
arch = get_cuda_arch()
version = get_cuda_version()
return _gen_ops_cached(arch, version)
DTYPE_TO_CUTLASS_TYPE = {

View File

@ -11,6 +11,7 @@ from typing import Any, Optional, Union
import torch
import torch.utils._pytree as pytree
from torch._inductor.codegen.cuda.cutlass_cache import maybe_fetch_ops
from torch._inductor.runtime.runtime_utils import dynamo_timed
from torch._inductor.scheduler import BaseSchedulerNode
from torch._inductor.select_algorithm import create_inputs_key
from torch._inductor.utils import clear_on_fresh_cache
@ -556,12 +557,15 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
"""
ops = self.gen_ops()
for name, op in ops:
for swizzle in inductor_cuda_config.cutlass_max_profiling_swizzle_options:
description = f"{name} swizzle={swizzle}"
self.maybe_append_choice(
choices, description=description, op=op, swizzle=swizzle
)
with dynamo_timed("CUTLASSGemmTemplate.maybe_append_choice"):
for name, op in ops:
for (
swizzle
) in inductor_cuda_config.cutlass_max_profiling_swizzle_options:
description = f"{name} swizzle={swizzle}"
self.maybe_append_choice(
choices, description=description, op=op, swizzle=swizzle
)
if len(ops) == 0:
input_layouts = [node.get_layout() for node in input_nodes]
@ -940,7 +944,8 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
log.debug("Using cached ops for %s", self.cache_key)
return self.filtered_ops_cache[self.cache_key]
maybe_ops = maybe_fetch_ops()
with dynamo_timed("CUTLASSGemmTemplate.maybe_fetch_ops"):
maybe_ops = maybe_fetch_ops()
if maybe_ops is None:
log.debug("Cannot fetch ops from cache, generating ops from scratch")
full_ops = cutlass_utils.gen_ops()