[cutlass backend] Add dynamo timed (#157410)

Differential Revision: [D77631592](https://our.internmc.facebook.com/intern/diff/D77631592/)

Before:
![Screenshot 2025-07-01 at 4 08 06 PM](https://github.com/user-attachments/assets/8f6445aa-50c7-456f-b5ac-b2749eb9bf40)

After (different run):
![Screenshot 2025-07-01 at 5 11 09 PM](https://github.com/user-attachments/assets/7513d312-c4dc-4e39-9718-c63eb641bc30)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157410
Approved by: https://github.com/jingsh
2025-12-06 12:20:52 +01:00 · 2025-07-01 17:13:59 -07:00 · 2025-07-01 17:13:59 -07:00 · b642a5c118
commit b642a5c118
parent 493f42a541
2 changed files with 17 additions and 10 deletions
--- a/torch/_inductor/codegen/cuda/cutlass_utils.py
+++ b/torch/_inductor/codegen/cuda/cutlass_utils.py
@@ -13,6 +13,7 @@ from typing import Any, Optional
 import sympy

 import torch
+from torch._inductor.runtime.runtime_utils import dynamo_timed
 from torch._inductor.utils import clear_on_fresh_cache

 from ... import config
@@ -278,9 +279,10 @@ def gen_ops() -> dict[Any, Any]:
    """
    Generates all supported CUTLASS operations.
    """
-    arch = get_cuda_arch()
-    version = get_cuda_version()
-    return _gen_ops_cached(arch, version)
+    with dynamo_timed("cutlass_utils.gen_ops"):
+        arch = get_cuda_arch()
+        version = get_cuda_version()
+        return _gen_ops_cached(arch, version)


 DTYPE_TO_CUTLASS_TYPE = {
--- a/torch/_inductor/codegen/cuda/gemm_template.py
+++ b/torch/_inductor/codegen/cuda/gemm_template.py
@@ -11,6 +11,7 @@ from typing import Any, Optional, Union
 import torch
 import torch.utils._pytree as pytree
 from torch._inductor.codegen.cuda.cutlass_cache import maybe_fetch_ops
+from torch._inductor.runtime.runtime_utils import dynamo_timed
 from torch._inductor.scheduler import BaseSchedulerNode
 from torch._inductor.select_algorithm import create_inputs_key
 from torch._inductor.utils import clear_on_fresh_cache
@@ -556,12 +557,15 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
        """

        ops = self.gen_ops()
-        for name, op in ops:
-            for swizzle in inductor_cuda_config.cutlass_max_profiling_swizzle_options:
-                description = f"{name} swizzle={swizzle}"
-                self.maybe_append_choice(
-                    choices, description=description, op=op, swizzle=swizzle
-                )
+        with dynamo_timed("CUTLASSGemmTemplate.maybe_append_choice"):
+            for name, op in ops:
+                for (
+                    swizzle
+                ) in inductor_cuda_config.cutlass_max_profiling_swizzle_options:
+                    description = f"{name} swizzle={swizzle}"
+                    self.maybe_append_choice(
+                        choices, description=description, op=op, swizzle=swizzle
+                    )

        if len(ops) == 0:
            input_layouts = [node.get_layout() for node in input_nodes]
@@ -940,7 +944,8 @@ class CUTLASSGemmTemplate(CUTLASSTemplate, ABC):
            log.debug("Using cached ops for %s", self.cache_key)
            return self.filtered_ops_cache[self.cache_key]

-        maybe_ops = maybe_fetch_ops()
+        with dynamo_timed("CUTLASSGemmTemplate.maybe_fetch_ops"):
+            maybe_ops = maybe_fetch_ops()
        if maybe_ops is None:
            log.debug("Cannot fetch ops from cache, generating ops from scratch")
            full_ops = cutlass_utils.gen_ops()