[Inductor] Add Subgraph as an Autotuning Choice (#150653)
Add the option to provide a Subgraph as an autotuning choice in Inductor. This is the key building block for the split-K optimization for GEMMs, which decomposes an mm into a bmm followed by a reduction. https://github.com/pytorch/pytorch/pull/150654 uses these changes to add decomposeK as a default autotuning choice for aten.mm in Inductor.

Using https://github.com/pytorch/pytorch/pull/150654 and a simple script:

```
import torch

def f(a, b):
    return torch.matmul(a, b)

def decompose_func(a_in, b_in):
    M, K = a_in.shape
    K, N = b_in.shape

    # TODO: Ideally we want to autotune over this parameter
    kPartitions = 256
    assert K % kPartitions == 0, "K must be divisible by kPartitions"
    B = K // kPartitions

    a_reshaped = a_in.reshape(M, B, kPartitions).transpose(0, 1)  # Shape: (B, M, kPartitions)
    b_reshaped = b_in.reshape(B, kPartitions, N)  # Shape: (B, kPartitions, N)
    result = torch.bmm(a_reshaped, b_reshaped)  # Shape: (B, M, N)
    return result.sum(dim=0).to(torch.float16)  # Sum over B dimension, Shape: (M, N)

for k in [4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768]:
    a = torch.randn(32, k, dtype=torch.float16, device="cuda", requires_grad=True)
    b = torch.randn(k, 32, dtype=torch.float16, device="cuda", requires_grad=True)
    compiled_res = torch.compile(f, dynamic=False)(a, b)
    decompose_res = decompose_func(a, b)
    print(f"Compiled mm result close to aten: {torch.allclose(f(a, b), compiled_res, atol=1e-5, rtol=0.5)}")
    print(f"Compiled mm result close to decompose: {torch.allclose(decompose_res, compiled_res, atol=1e-5, rtol=0.5)}")
```

we can autotune the decomposeK decomposition against aten and the traditional Triton templates in Inductor. On an H100, decomposeK is faster than aten by about 10% on average and more than 4x faster than the best Triton template. These GEMMs are skinny (M = N = 32 with a large K), so a single mm exposes very little output-tile parallelism; splitting K into batches is what lets the kernel fill the machine. For example:

```
AUTOTUNE mm(32x28672, 28672x32)
  decompose_k_mm 0.0126 ms 100.0%
  mm             0.0144 ms  87.5%
  triton_mm_69   0.0579 ms  21.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_75   0.0677 ms  18.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_76   0.0850 ms  14.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_68   0.1444 ms   8.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
  triton_mm_72   0.1546 ms   8.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
  triton_mm_74   0.1819 ms   6.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
  triton_mm_67   0.1917 ms   6.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4
  triton_mm_73   0.2766 ms   4.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
```

https://pastebin.com/g3FMaauT is the generated code from Inductor containing the subgraph decomposition for aten.mm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150653
Approved by: https://github.com/eellison
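As a standalone sanity check (not part of this PR), the identity decomposeK relies on can be verified in eager mode on any device; the shapes below match the mm(32x28672, 28672x32) case above, which splits into B = 112 sub-GEMMs:

```
# Illustrative check, not from the PR: splitting K into B = K // kPartitions
# batches, running one bmm, and summing over the batch dimension reproduces
# the full matmul.
import torch

M, K, N, kPartitions = 32, 28672, 32, 256
B = K // kPartitions  # 112 independent (32, 256) @ (256, 32) sub-GEMMs

a = torch.randn(M, K)
b = torch.randn(K, N)

a_r = a.reshape(M, B, kPartitions).transpose(0, 1)  # (B, M, kPartitions)
b_r = b.reshape(B, kPartitions, N)                  # (B, kPartitions, N)
decomposed = torch.bmm(a_r, b_r).sum(dim=0)         # (M, N)

# Tolerances are loose because the two reductions accumulate in different orders.
assert torch.allclose(decomposed, a @ b, rtol=1e-3, atol=1e-2)
```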
This commit is contained in:
parent
ad5e9065ac
commit
83ae61fd8e
test/inductor/test_subgraph_choice.py (new file, 118 lines)
@@ -0,0 +1,118 @@
```
# Owner(s): ["module: inductor"]
import functools

import torch
from torch._dispatch.python import enable_python_dispatcher
from torch._inductor.codegen.subgraph import SubgraphTemplate
from torch._inductor.decomposition import select_decomp_table
from torch._inductor.ir import Buffer, FixedLayout
from torch._inductor.lowering import register_lowering
from torch._inductor.select_algorithm import (
    AlgorithmSelectorCache,
    autotune_select_algorithm,
)
from torch._inductor.test_case import run_tests, TestCase
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU


class TestSubgraphChoice(TestCase):
    def setUp(self):
        super().setUp()

    def _create_buffer(self, name, shape, dtype):
        return Buffer(
            name=name,
            layout=FixedLayout(torch.device(f"{GPU_TYPE}:0"), dtype=dtype, size=shape),
        )

    def test_subgraph_decompose_k(self):
        from torch._inductor.kernel.mm import aten_mm
        from torch._inductor.kernel.mm_common import mm_args

        @torch.library.custom_op("mylib::matmul_decompose", mutates_args={})
        def matmul_decompose(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
            return a @ b

        @matmul_decompose.register_fake
        def _(a, b):
            return a @ b

        def decomposeK(a, b, kPartitions):
            m = a.shape[0]
            n = b.shape[1]
            k = a.shape[1]

            B = k // kPartitions
            a_reshaped = torch.permute(a.reshape(m, B, kPartitions), (1, 0, 2))
            b_reshaped = b.reshape(B, kPartitions, n)
            result = torch.bmm(a_reshaped, b_reshaped)
            result_fp32 = result.to(torch.float32)
            reduced_buf = torch.sum(result_fp32, 0)
            return reduced_buf.to(a.dtype)

        mat1_shape, mat2_shape = (32, 4096), (4096, 32)

        @register_lowering(torch.ops.mylib.matmul_decompose)
        def _(a, b):
            _, _, _, layout, mat1, mat2 = mm_args(a, b)

            choices = [aten_mm.bind((mat1, mat2), layout)]

            # TODO (PaulZhang12): Once decomposeK lands in Inductor, move this
            kPartitions = 256
            with enable_python_dispatcher():
                decompositions = select_decomp_table()

                decompose_k_subgraph_template = SubgraphTemplate(
                    name="decompose_k_mm",
                    make_fx_graph=make_fx(
                        functools.partial(decomposeK, kPartitions=kPartitions),
                        decompositions,
                        tracing_mode="real",
                    ),
                )

            mat1_tensor, mat2_tensor = (
                AlgorithmSelectorCache.benchmark_example_value(mat1),
                AlgorithmSelectorCache.benchmark_example_value(mat2),
            )
            decompose_k_subgraph_template.maybe_append_choice(
                choices,
                input_nodes=(mat1, mat2),
                layout=layout,
                example_inputs=[mat1_tensor, mat2_tensor],
            )

            # Test benchmarking against aten
            autotune_select_algorithm("test_subgraph_choice", choices, [a, b], layout)

            # Only return decomposeK case for codegen
            choices = [choices[1]]
            return autotune_select_algorithm(
                "test_subgraph_choice", choices, [a, b], layout
            )

        a_in = torch.randn(
            mat1_shape, dtype=torch.float16, device=torch.device(f"{GPU_TYPE}:0")
        )
        b_in = torch.randn(
            mat2_shape, dtype=torch.float16, device=torch.device(f"{GPU_TYPE}:0")
        )

        def func(mat1, mat2):
            return torch.ops.mylib.matmul_decompose(mat1, mat2)

        compiled_func = torch.compile(func, mode="max-autotune", dynamic=False)

        res = compiled_func(a_in, b_in)

        # Check the compiled result against regular torch.mm
        # Relax precision as decomposeK does first accumulation in fp16
        torch.testing.assert_close(res, a_in @ b_in, atol=1e-1, rtol=1e-1)


if __name__ == "__main__":
    # Set env to make it work in CI.
    if HAS_GPU and HAS_CPU:
        run_tests()
```
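A note on `maybe_append_choice`, which the test calls but this diff does not define: `SubgraphTemplate` inherits it from `KernelTemplate`. A simplified paraphrase of the inherited helper's behavior (an assumption based on `torch/_inductor/codegen/common.py`, not code from this PR):

```
# Simplified paraphrase of KernelTemplate.maybe_append_choice (assumed behavior):
def maybe_append_choice(self, choices, **kwargs):
    # Generate a ChoiceCaller for this template and append it to the candidate
    # list; if the template cannot handle these inputs, the choice is skipped
    # and the error is returned to the caller.
    try:
        choices.append(self.generate(**kwargs))
        return None
    except NotImplementedError as e:
        return e
```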
torch/_inductor/codegen/subgraph.py (new file, 157 lines)
@@ -0,0 +1,157 @@
```
import logging
from typing import Any, Callable

import torch
from torch._inductor import ir
from torch._inductor.codegen.common import KernelTemplate
from torch._inductor.ir import Buffer, Layout
from torch._inductor.runtime.benchmarking import benchmarker
from torch._inductor.virtualized import V


log = logging.getLogger(__name__)


class SubgraphChoiceCaller(ir.ChoiceCaller):
    """
    Represents a subgraph autotuning choice; the subgraph can be an arbitrary
    GraphModule. Compiles the subgraph down to a module for benchmarking.
    """

    def __init__(
        self,
        name: str,
        input_nodes: list[Buffer],
        layout: Layout,
        description: str,
        gm: torch.fx.GraphModule,
        example_inputs: list[Any],
    ) -> None:
        super().__init__(name, input_nodes, layout, description)
        self.gm = gm
        self.example_inputs = example_inputs

    def __str__(self) -> str:
        return f"SubgraphCaller({self.name})"

    def benchmark(self, *args: list[Any], out: torch.Tensor) -> float:
        # Codegen Subgraph for benchmarking
        # Need GraphLowering instead of SubgraphLowering to generate
        # fully callable module
        import torch._inductor.config as inductor_config
        from torch._inductor.graph import GraphLowering

        bm_graph_lowering = GraphLowering(
            gm=self.gm,
            example_inputs=self.example_inputs,
            shape_env=V.graph._shape_env,
            cpp_wrapper=V.graph.cpp_wrapper,
            aot_mode=V.graph.aot_mode,
            extern_node_serializer=V.graph.extern_node_serializer,
            is_inference=V.graph.is_inference,
            is_backward=V.graph.is_backward,
            name=f"benchmark_{self.name}",
        )

        with V.set_graph_handler(bm_graph_lowering):
            # Don't bother autotuning on Triton here
            with inductor_config.patch(
                max_autotune=False,
                max_autotune_gemm=False,
                max_autotune_gemm_backends="ATEN",
            ):
                bm_graph_lowering.run(*self.example_inputs)
                mod = bm_graph_lowering.compile_to_module()
                bm_func = mod.call

                bm_func([*args])

        return benchmarker.benchmark_gpu(lambda: bm_func([*args]))

    def hash_key(self) -> str:
        return "-".join(
            [
                self.name,
                *[
                    str(arg.shape)
                    for arg in self.example_inputs
                    if isinstance(arg, torch.Tensor)
                ],
                str(self.gm.graph),
            ]
        )

    def output_node(self) -> ir.TensorBox:
        return ir.TensorBox.create(
            ir.SubgraphBuffer(
                layout=self.layout,
                input_nodes=self.input_nodes,
                gm=self.gm,
                example_inputs=self.example_inputs,
                subgraph_name=self.name,
            )
        )

    def info_dict(self) -> dict[str, Any]:
        """Information returned here is logged to the autotune log file when that is enabled."""
        return {
            "backend": "subgraph",
            "kernel_name": self.name,
        }

    def autoheuristic_id(self) -> str:
        return f"subgraph_{self.name}"


class SubgraphTemplate(KernelTemplate):
    """
    A template for subgraph evaluation to be used in autotuning.

    This class allows creating customized subgraphs that can be appended
    as choices during the autotuning process, enabling the selection of
    optimal implementations for complex operations.
    """

    def __init__(
        self,
        name: str,
        make_fx_graph: Callable[..., Any],
    ):
        """
        Initialize a subgraph template.

        Args:
            name: The name of this template
            make_fx_graph: Callable that traces the example inputs into an FX graph
        """
        self.name = name
        self.make_fx_graph = make_fx_graph

    def generate(  # type: ignore[override]
        self,
        input_nodes: list[Buffer],
        layout: Layout,
        example_inputs: list[Any],
        **kwargs: Any,
    ) -> SubgraphChoiceCaller:
        """
        Generate a SubgraphChoiceCaller instance for autotuning.

        Args:
            input_nodes: List of input nodes to the subgraph
            layout: Memory layout information for the output
            example_inputs: Example tensor inputs used to trace and benchmark the subgraph
            **kwargs: Additional keyword arguments

        Returns:
            SubgraphChoiceCaller: A callable object that can be used for autotuning
        """
        gm = self.make_fx_graph(*example_inputs)

        return SubgraphChoiceCaller(
            name=self.name,
            input_nodes=input_nodes,
            layout=layout,
            description="",
            gm=gm,
            example_inputs=example_inputs,
        )
```
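To place `benchmark()` and `output_node()` in context: during `autotune_select_algorithm`, every `ChoiceCaller` is timed and the fastest is asked for its output IR node. A rough sketch of that selection (illustrative only; the real logic lives in `torch/_inductor/select_algorithm.py` and is not part of this diff):

```
# Illustrative sketch of autotuning selection, not actual Inductor code.
def pick_best(choices, example_args, out_buf):
    timings = {
        choice: choice.benchmark(*example_args, out=out_buf)  # time per call
        for choice in choices
    }
    best = min(timings, key=timings.__getitem__)
    # For a SubgraphChoiceCaller this materializes an ir.SubgraphBuffer, which
    # later codegens the whole subgraph via wrapper.codegen_subgraph.
    return best.output_node()
```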
torch/_inductor/ir.py (modified)
@@ -5999,6 +5999,49 @@ class TMADescriptor(ExternKernel):
```
        wrapper.generate_tma_descriptor(self)


class SubgraphBuffer(ExternKernel):
    def __init__(
        self,
        layout: Layout,
        input_nodes: list[Buffer],
        gm: torch.fx.GraphModule,
        example_inputs: list[Any],
        subgraph_name: str,
    ):
        super().__init__(None, layout, input_nodes)
        self.gm = gm
        self.example_inputs = example_inputs
        self.name = V.graph.register_buffer(self)
        V.graph.register_operation(self)

        self.subgraph = V.graph.make_subgraph(
            self.gm, self.example_inputs, subgraph_name
        )

        import torch._inductor.config as inductor_config

        with V.set_graph_handler(self.subgraph):
            # Don't bother autotuning on Triton here
            with inductor_config.patch(  # type: ignore[no-untyped-def]
                max_autotune=False,
                max_autotune_gemm=False,
                max_autotune_gemm_backends="ATEN",
            ):
                self.subgraph.run(*self.example_inputs)

    def codegen(self, wrapper) -> None:  # type: ignore[no-untyped-def]
        class CodegenGraph:
            def __init__(self, graph: GraphLowering):
                self.graph = graph
                self.name = graph.name

        wrapper.codegen_subgraph(
            CodegenGraph(self.subgraph),
            [*[buffer.get_name() for buffer in self.inputs]],
            [self.name],
        )


class UserDefinedTritonKernel(ExternKernel):
    def get_kernel_and_metadata(self):  # type: ignore[no-untyped-def]
        from triton.runtime.autotuner import Autotuner
```