[Inductor] Cleanup autotune_fallback_to_aten post-deprecation (#154331)

Fixes #153298

This PR is the third and final step of #147479.
All references to autotune_fallback_to_aten have been removed; the config entry survives only as a deprecated, ignored stub that is hard-coded to False.
All calls to should_fallback_to_aten() have also been removed, since the helper is no longer needed.
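For users who previously relied on the implicit ATen fallback, the replacement is to list ATEN explicitly in max_autotune_gemm_backends. A minimal sketch, assuming a CUDA device; the backend list and shapes are illustrative, not taken from this PR:

```python
# Minimal sketch, assuming a CUDA device is available. With the automatic
# fallback gone, ATen GEMM kernels are only considered when "ATEN" is listed
# explicitly in the backend string.
import torch
from torch._inductor import config as inductor_config

inductor_config.max_autotune = True
inductor_config.max_autotune_gemm_backends = "ATEN,TRITON"  # illustrative list

a = torch.randn(128, 64, device="cuda")
b = torch.randn(64, 32, device="cuda")

compiled_mm = torch.compile(torch.mm)
out = compiled_mm(a, b)  # autotunes over the ATen and Triton GEMM choices
```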

[henrylhtsang](https://github.com/henrylhtsang)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154331
Approved by: https://github.com/henrylhtsang
Authored by Joaquin on 2025-05-29 20:29:54 +00:00; committed by PyTorch MergeBot
parent 629fca295e
commit cb56df55dc
8 changed files with 5 additions and 122 deletions

View File

@@ -80,7 +80,6 @@ def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) ->
 @dataclass(frozen=True, kw_only=True)
 class ExperimentConfig:
-    autotune_fallback_to_aten: bool = False
     max_autotune: bool = True
     coordinate_descent_tuning: bool = True
     max_autotune_gemm_backends: str = "ATEN"
@@ -91,7 +90,6 @@ class ExperimentConfig:
     def to_options(self) -> dict[str, Any]:
         return {
-            "autotune_fallback_to_aten": self.autotune_fallback_to_aten,
             "max_autotune": self.max_autotune,
             "coordinate_descent_tuning": self.coordinate_descent_tuning,
             "max_autotune_gemm_backends": self.max_autotune_gemm_backends,

View File

@@ -129,7 +129,6 @@ use_evt_config = config.patch(
         "max_autotune": True,
         "max_autotune_gemm_backends": "CUTLASS",
         "cuda.cutlass_max_profiling_configs": 1,
-        "autotune_fallback_to_aten": False,
         "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
         "cuda.cutlass_tma_only": True,
         "cuda.cutlass_epilogue_fusion_enabled": True,
@@ -141,7 +140,6 @@ fp8_config = config.patch(
         "max_autotune": True,
         "max_autotune_gemm_backends": "CUTLASS",
         "cuda.cutlass_max_profiling_configs": 1,
-        "autotune_fallback_to_aten": False,
         "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
         "cuda.cutlass_tma_only": True,
     }
@@ -269,7 +267,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
                 "cuda.cutlass_max_profiling_configs": 4,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(torch.mm)(a, b)
@@ -308,7 +305,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
                 "cuda.cutlass_max_profiling_configs": 4,
-                "autotune_fallback_to_aten": False,
             }
         ):
             for x_shape in x_shapes:
@@ -336,7 +332,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
                 "cuda.cutlass_max_profiling_configs": 4,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(torch.bmm)(a, b)
@@ -369,7 +364,6 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
             }
         ):
             from torch._inductor.utils import run_and_get_code
@@ -412,7 +406,6 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_max_profiling_swizzle_options": [
                     1,
                     2,
@@ -482,7 +475,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ), dynamo_config.patch({"error_on_recompile": dynamic}):
             expected = [model(*input) for input in inputs]
@@ -560,7 +552,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ), dynamo_config.patch({"error_on_recompile": dynamic}):
             expected = [model(*input) for input in inputs]
@@ -622,7 +613,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ):
             expected = [model(*input) for input in inputs]
@@ -657,7 +647,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
                 "cuda.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
-                "autotune_fallback_to_aten": False,
             }
         ):
             for M, K, N in (
@@ -713,7 +702,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 4,
                 "cuda.version": "12.2",  # required to enable the Kernels we need
-                "autotune_fallback_to_aten": False,
             }
         ):
             counters["inductor"]["cuda_epilogue_fusion_counter"] = 0
@@ -812,7 +800,6 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
@@ -831,7 +818,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_max_profiling_configs": 2,
             }
         ):
@@ -870,7 +856,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_max_profiling_configs": 2,
             }
         ):
@@ -904,7 +889,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_op_allowlist_regex": "128x256x64.*stream_k_warpspecialized_cooperative_epi_nosmem",
                 "cuda.cutlass_max_profiling_configs": 1,
             }
@@ -951,7 +935,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 2,
                 "autotune_local_cache": True,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=dynamic)(a_sparse, b)
@@ -1107,7 +1090,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ), mock.patch(
             "torch._inductor.kernel.mm.autotune_select_algorithm",
@@ -1166,7 +1148,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_presets": presets,
             }
         ), mock.patch(
@@ -1272,7 +1253,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
                 "cuda.generate_test_runner": True,  # put standalone runner in the generated code
             }
         ):
@@ -1389,7 +1369,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ):
             compiled = torch.compile(torch.mm)
@@ -1413,7 +1392,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
             }
         ):
             _ = torch.compile(model)(B)
@@ -1436,7 +1414,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
             }
         ):
             _ = torch.compile(model)(B)

View File

@@ -139,7 +139,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -164,7 +163,6 @@ class TestMaxAutotune(TestCase):
         with self.assertRaises(BackendCompilerFailed) as context, config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -198,7 +196,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -259,7 +256,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -285,7 +281,6 @@ class TestMaxAutotune(TestCase):
         with self.assertRaises(BackendCompilerFailed) as context, config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -321,7 +316,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -380,7 +374,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": True,
                 "max_autotune_gemm_backends": "TRITON",
                 "test_configs.autotune_choice_name_regex": "tma",
@@ -868,7 +861,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="",
-        autotune_fallback_to_aten=False,
     )
     def test_no_valid_choices(self):
         a = torch.zeros([2, 2], device=GPU_TYPE)
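A hedged sketch of the behavior test_no_valid_choices above exercises: with an empty backend list and no ATen fallback, compilation is expected to fail loudly rather than silently switch to ATen. The exception type is taken from the assertRaises(BackendCompilerFailed) calls earlier in this file; shapes and device are illustrative:

```python
# Hedged sketch based on the test_no_valid_choices pattern above: an empty
# GEMM backend list now produces a compile-time error instead of an ATen
# fallback. Shapes and device are illustrative.
import torch
from torch._dynamo.exc import BackendCompilerFailed
from torch._inductor import config as inductor_config

a = torch.zeros(2, 2, device="cuda")
b = torch.zeros(2, 2, device="cuda")

with inductor_config.patch({"max_autotune": True, "max_autotune_gemm_backends": ""}):
    try:
        torch.compile(torch.mm)(a, b)
    except BackendCompilerFailed:
        print("no valid GEMM choices -> hard error instead of ATen fallback")
```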
@@ -881,7 +873,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_inf_timing(self, multi_template):
         from unittest.mock import patch
@@ -955,7 +946,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_max_autotune_decompose_k(self, sizes, dtype, dynamic):
         fp16_red_setting = (
@@ -1058,7 +1048,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_max_autotune_decompose_k_dynamic_input(self):
         def f(a, b):
@@ -1106,7 +1095,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_max_autotune_decompose_k_output_stride(self):
         def f(a, b):

View File

@@ -436,11 +436,8 @@ max_autotune_gemm_search_space: Literal["DEFAULT", "EXHAUSTIVE"] = os.environ.ge
     "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE", "DEFAULT"
 ).upper()  # type: ignore[assignment]

-# NOTE: This feature is deprecated and will be defauled to False in the future.
-# Whether we fall back to ATen or hard error when no matches are found during autotuning
-autotune_fallback_to_aten = (
-    os.environ.get("TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN", "0") == "1"
-)
+# DEPRECATED. This setting is ignored.
+autotune_fallback_to_aten = False

 # the value used as a fallback for the unbacked SymInts
 # that can appear in the input shapes (e.g., in autotuning)
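A short sketch of what the stub above implies for downstream code, assuming only what the diff shows: the attribute still exists but is fixed at False, and TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN no longer feeds it:

```python
# Sketch under the assumptions stated above: the attribute is still readable,
# so older code does not break at import time, but its value is always False
# and patching it no longer affects GEMM autotuning.
from torch._inductor import config as inductor_config

assert inductor_config.autotune_fallback_to_aten is False

with inductor_config.patch({"autotune_fallback_to_aten": True}):
    # accepted syntactically, but ignored by the autotuner after this PR
    pass
```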

View File

@@ -27,7 +27,6 @@ from .mm_common import (
     mm_args,
     mm_config_kwargs,
     mm_options,
-    should_fallback_to_aten,
 )
@@ -233,9 +232,6 @@ def tuned_bmm(mat1, mat2, out_dtype=None, *, layout=None):
     if use_ck_gemm_template(layout, m, n, k):
         CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2])
-    if should_fallback_to_aten(choices):
-        choices.append(aten_bmm.bind((mat1, mat2), layout))
     return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout)

View File

@@ -60,7 +60,6 @@ from .mm_common import (
     persistent_mm_options,
     scale_mm_epilogue,
     scaled_mm_options,
-    should_fallback_to_aten,
 )
@@ -783,8 +782,6 @@ def tuned_mm(mat1, mat2, *, layout=None):
     for k in inductor_config.external_matmul:
         choices.append(lazy_register_extern_choice(k).bind((mat1, mat2), layout))
-    if should_fallback_to_aten(choices):
-        return aten_mm.bind((mat1, mat2), aten_layout).output_node()
     return autotune_select_algorithm(name, choices, [mat1, mat2], layout)
@@ -834,15 +831,11 @@ def tuned_int_mm(mat1, mat2, *, layout=None):
                 **mm_options(config, m, n, k, layout),
             )
-    if should_fallback_to_aten(choices):
-        return aten__int_mm.bind((mat1, mat2), layout).output_node()
     return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)

 @register_lowering(aten.addmm, type_promotion_kind=None)
 def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
-    ordered_kwargs_for_cpp_kernel = ("beta", "alpha")
     device_type = ir.get_device_type(mat1)
     m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
     static_shape, is_nonzero = _is_static_problem(layout)
@@ -973,30 +966,6 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
                 has_bias=True,
             )
-    if should_fallback_to_aten(choices):
-        choices.append(
-            aten_addmm.bind(
-                (inp_expanded, mat1, mat2),
-                layout,
-                ordered_kwargs_for_cpp_kernel,
-                alpha=alpha,
-                beta=beta,
-            )
-        )
-        if (
-            inp_expanded.get_stride()[0] == 0
-            and inp_expanded.get_device().type == "cuda"
-            and inductor_config.triton.autotune_cublasLt
-        ):
-            # unexpand inp to make sure fused addmm from cublasLt is used
-            choices.insert(
-                0,
-                aten_bias_addmm.bind(
-                    (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta
-                ),
-            )
     return autotune_select_algorithm(
         "addmm", choices, [inp_expanded, mat1, mat2], layout
     )
@@ -1198,9 +1167,6 @@ def tuned_scaled_mm(
     if is_nonzero and use_ck_gemm_template(layout, m, n, k):
         CKGemmTemplate.add_ck_gemm_choices(choices, layout, input_nodes)
-    if should_fallback_to_aten(choices):
-        return aten_choice.output_node()
     return autotune_select_algorithm("scaled_mm", choices, input_nodes, layout)

View File

@@ -12,32 +12,13 @@ from torch._inductor.virtualized import V
 from .. import config as inductor_config
 from ..codegen.wrapper import PythonWrapperCodegen
-from ..ir import _IntLike, ChoiceCaller, Layout, TensorBox
-from ..utils import get_num_sms, TMA_DESCRIPTOR_SIZE, use_aten_gemm_kernels
+from ..ir import _IntLike, Layout, TensorBox
+from ..utils import get_num_sms, TMA_DESCRIPTOR_SIZE

 log = logging.getLogger(__name__)

-def should_fallback_to_aten(choices: list[ChoiceCaller]) -> bool:
-    if len(choices) == 0 and not use_aten_gemm_kernels():
-        if inductor_config.autotune_fallback_to_aten:
-            log.warning(
-                "No choices for GEMM, using ATen backend as fallback. "
-                "This behavior is being deprecated. Please add include Aten in max_autotune_gemm_backends."
-            )
-            return True
-        else:
-            log.warning(
-                "No choices for GEMM, chose not to fallback to ATen backend. "
-                "To temporarily change this behavior, set autotune_fallback_to_aten to True "
-                "via TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN=1, but this knob is being deprecated. "
-                "The long term fix is to include Aten in max_autotune_gemm_backends."
-            )
-            return False
-    return False

 @SymbolicGridFn
 def mm_grid(m, n, meta, *, cdiv):
     """

View File

@@ -4,7 +4,7 @@ from typing import Any
 import torch
 from torch._inductor.kernel.mm_common import mm_args

-from . import config as inductor_config, lowering
+from . import lowering
 from .codegen.cpp_gemm_template import CppGemmTemplate, CppWoqInt4GemmTemplate
 from .codegen.cpp_utils import create_epilogue_with_attr
 from .lowering import expand, register_lowering
@@ -90,16 +90,6 @@ def register_woq_mm_ops() -> None:
                 epilogue_creator=_mul_epilogue,  # type: ignore[arg-type]
             )
-        if (
-            len(choices) == 0
-            and inductor_config.autotune_fallback_to_aten
-            and not use_aten_gemm_kernels()
-        ):
-            log.warning("No choices for GEMM, using ATen backend as fallback")
-            return aten__weight_int8pack_mm.bind(
-                (mat1, mat2, scale), aten_layout
-            ).output_node()
         return autotune_select_algorithm(
             "_weight_int8pack_mm", choices, [mat1, mat2, scale], aten_layout
         )
@@ -153,16 +143,6 @@ def register_woq_mm_ops() -> None:
             [mat1, mat2, group_size, qScaleAndZeros],
         )
-        if (
-            len(choices) == 0
-            and inductor_config.autotune_fallback_to_aten
-            and not use_aten_gemm_kernels()
-        ):
-            log.warning("No choices for GEMM, using ATen backend as fallback")
-            return aten__weight_int4pack_mm_cpu.bind(
-                (mat1, mat2, group_size, qScaleAndZeros), aten_layout
-            ).output_node()
         # define functions to generate example inputs for weight and group size
         # otherwise, autotuner generates example inputs of all zeros for them
         def get_example_weight(x: torch._inductor.ir.IRNode) -> torch.Tensor: