[Inductor] Cleanup autotune_fallback_to_aten post-deprecation (#154331)

Fixes #153298

This PR is the third and final step of #147479.
All references to autotune_fallback_to_aten have been removed; the config option now survives only as a deprecated, ignored stub set to False.
All calls to should_fallback_to_aten() have been removed as well, along with the helper itself, since it is no longer needed.
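For code that previously leaned on the implicit ATen fallback, the replacement is to name ATEN explicitly in max_autotune_gemm_backends. A minimal sketch of that migration (assumes a CUDA device; the shapes, dtype, and the helper mm are arbitrary placeholders):

```python
# Migration sketch: with the implicit fallback gone, keep ATen in the running by
# naming it as an autotune backend instead of relying on autotune_fallback_to_aten.
import torch
from torch._inductor import config as inductor_config

def mm(a, b):
    return a @ b

a = torch.randn(128, 64, device="cuda", dtype=torch.float16)
b = torch.randn(64, 256, device="cuda", dtype=torch.float16)

with inductor_config.patch(
    {
        "max_autotune": True,
        # ATen GEMMs are now just another candidate; they win the autotune when the
        # template backends (e.g. TRITON) have nothing faster, or nothing at all.
        "max_autotune_gemm_backends": "ATEN,TRITON",
    }
):
    out = torch.compile(mm)(a, b)
```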

[henrylhtsang](https://github.com/henrylhtsang)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154331
Approved by: https://github.com/henrylhtsang
Joaquin 2025-05-29 20:29:54 +00:00 committed by PyTorch MergeBot
parent 629fca295e
commit cb56df55dc
8 changed files with 5 additions and 122 deletions

View File

@@ -80,7 +80,6 @@ def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) ->
@dataclass(frozen=True, kw_only=True)
class ExperimentConfig:
autotune_fallback_to_aten: bool = False
max_autotune: bool = True
coordinate_descent_tuning: bool = True
max_autotune_gemm_backends: str = "ATEN"
@@ -91,7 +90,6 @@ class ExperimentConfig:
def to_options(self) -> dict[str, Any]:
return {
"autotune_fallback_to_aten": self.autotune_fallback_to_aten,
"max_autotune": self.max_autotune,
"coordinate_descent_tuning": self.coordinate_descent_tuning,
"max_autotune_gemm_backends": self.max_autotune_gemm_backends,

View File

@@ -129,7 +129,6 @@ use_evt_config = config.patch(
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"autotune_fallback_to_aten": False,
"benchmark_epilogue_fusion": False, # EVT doesn't support benchmark fusion yet
"cuda.cutlass_tma_only": True,
"cuda.cutlass_epilogue_fusion_enabled": True,
@@ -141,7 +140,6 @@ fp8_config = config.patch(
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"autotune_fallback_to_aten": False,
"benchmark_epilogue_fusion": False, # EVT doesn't support benchmark fusion yet
"cuda.cutlass_tma_only": True,
}
@@ -269,7 +267,6 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": "CUTLASS",
"compile_threads": 4,
"cuda.cutlass_max_profiling_configs": 4,
"autotune_fallback_to_aten": False,
}
):
Y_compiled = torch.compile(torch.mm)(a, b)
@@ -308,7 +305,6 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": "CUTLASS",
"compile_threads": 4,
"cuda.cutlass_max_profiling_configs": 4,
"autotune_fallback_to_aten": False,
}
):
for x_shape in x_shapes:
@@ -336,7 +332,6 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": "CUTLASS",
"compile_threads": 4,
"cuda.cutlass_max_profiling_configs": 4,
"autotune_fallback_to_aten": False,
}
):
Y_compiled = torch.compile(torch.bmm)(a, b)
@@ -369,7 +364,6 @@ class TestCutlassBackend(TestCase):
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 1,
"autotune_fallback_to_aten": False,
}
):
from torch._inductor.utils import run_and_get_code
@@ -412,7 +406,6 @@ class TestCutlassBackend(TestCase):
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 1,
"autotune_fallback_to_aten": False,
"cuda.cutlass_max_profiling_swizzle_options": [
1,
2,
@@ -482,7 +475,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
}
), dynamo_config.patch({"error_on_recompile": dynamic}):
expected = [model(*input) for input in inputs]
@@ -560,7 +552,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
}
), dynamo_config.patch({"error_on_recompile": dynamic}):
expected = [model(*input) for input in inputs]
@@ -622,7 +613,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
}
):
expected = [model(*input) for input in inputs]
@@ -657,7 +647,6 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"cuda.cutlass_op_allowlist_regex": "stream_k", # only stream-k GEMM Kernels
"autotune_fallback_to_aten": False,
}
):
for M, K, N in (
@@ -713,7 +702,6 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 4,
"cuda.version": "12.2", # required to enable the Kernels we need
"autotune_fallback_to_aten": False,
}
):
counters["inductor"]["cuda_epilogue_fusion_counter"] = 0
@@ -812,7 +800,6 @@ class TestCutlassBackend(TestCase):
"autotune_in_subproc": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
}
):
Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
@@ -831,7 +818,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": False,
"max_autotune_gemm_backends": "CUTLASS",
"autotune_fallback_to_aten": False,
"cuda.cutlass_max_profiling_configs": 2,
}
):
@@ -870,7 +856,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": False,
"max_autotune_gemm_backends": "CUTLASS",
"autotune_fallback_to_aten": False,
"cuda.cutlass_max_profiling_configs": 2,
}
):
@@ -904,7 +889,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"autotune_in_subproc": False,
"max_autotune_gemm_backends": "CUTLASS",
"autotune_fallback_to_aten": False,
"cuda.cutlass_op_allowlist_regex": "128x256x64.*stream_k_warpspecialized_cooperative_epi_nosmem",
"cuda.cutlass_max_profiling_configs": 1,
}
@@ -951,7 +935,6 @@ class TestCutlassBackend(TestCase):
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"autotune_local_cache": True,
"autotune_fallback_to_aten": False,
}
):
Y_compiled = torch.compile(mm, dynamic=dynamic)(a_sparse, b)
@@ -1107,7 +1090,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
}
), mock.patch(
"torch._inductor.kernel.mm.autotune_select_algorithm",
@@ -1166,7 +1148,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
"cuda.cutlass_presets": presets,
}
), mock.patch(
@@ -1272,7 +1253,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
"cuda.generate_test_runner": True, # put standalone runner in the generated code
}
):
@@ -1389,7 +1369,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": max_autotune_gemm_backends,
"cuda.cutlass_max_profiling_configs": 2,
"autotune_fallback_to_aten": False,
}
):
compiled = torch.compile(torch.mm)
@@ -1413,7 +1392,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"autotune_fallback_to_aten": False,
}
):
_ = torch.compile(model)(B)
@@ -1436,7 +1414,6 @@ class TestCutlassBackend(TestCase):
"max_autotune": True,
"max_autotune_gemm_backends": "CUTLASS",
"cuda.cutlass_max_profiling_configs": 1,
"autotune_fallback_to_aten": False,
}
):
_ = torch.compile(model)(B)

View File

@@ -139,7 +139,6 @@ class TestMaxAutotune(TestCase):
with config.patch(
{
"max_autotune": True,
"autotune_fallback_to_aten": False,
"triton.enable_persistent_tma_matmul": "1",
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
@@ -164,7 +163,6 @@ class TestMaxAutotune(TestCase):
with self.assertRaises(BackendCompilerFailed) as context, config.patch(
{
"max_autotune": True,
"autotune_fallback_to_aten": False,
"triton.enable_persistent_tma_matmul": "1",
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
@@ -198,7 +196,6 @@ class TestMaxAutotune(TestCase):
with config.patch(
{
"max_autotune": True,
"autotune_fallback_to_aten": False,
"triton.enable_persistent_tma_matmul": "1",
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
@@ -259,7 +256,6 @@ class TestMaxAutotune(TestCase):
with config.patch(
{
"max_autotune": True,
"autotune_fallback_to_aten": False,
"triton.enable_persistent_tma_matmul": "1",
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
@@ -285,7 +281,6 @@ class TestMaxAutotune(TestCase):
with self.assertRaises(BackendCompilerFailed) as context, config.patch(
{
"max_autotune": True,
"autotune_fallback_to_aten": False,
"triton.enable_persistent_tma_matmul": "1",
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
@@ -321,7 +316,6 @@ class TestMaxAutotune(TestCase):
with config.patch(
{
"max_autotune": True,
"autotune_fallback_to_aten": False,
"triton.enable_persistent_tma_matmul": "1",
"test_configs.autotune_choice_name_regex": "mm_persistent_tma",
}
@@ -380,7 +374,6 @@ class TestMaxAutotune(TestCase):
with config.patch(
{
"max_autotune": True,
"autotune_fallback_to_aten": False,
"triton.enable_persistent_tma_matmul": True,
"max_autotune_gemm_backends": "TRITON",
"test_configs.autotune_choice_name_regex": "tma",
@@ -868,7 +861,6 @@ class TestMaxAutotune(TestCase):
@config.patch(
max_autotune=True,
max_autotune_gemm_backends="",
autotune_fallback_to_aten=False,
)
def test_no_valid_choices(self):
a = torch.zeros([2, 2], device=GPU_TYPE)
@@ -881,7 +873,6 @@ class TestMaxAutotune(TestCase):
@config.patch(
max_autotune=True,
max_autotune_gemm_backends="TRITON",
autotune_fallback_to_aten=False,
)
def test_inf_timing(self, multi_template):
from unittest.mock import patch
@@ -955,7 +946,6 @@ class TestMaxAutotune(TestCase):
@config.patch(
max_autotune=True,
max_autotune_gemm_backends="TRITON",
autotune_fallback_to_aten=False,
)
def test_max_autotune_decompose_k(self, sizes, dtype, dynamic):
fp16_red_setting = (
@@ -1058,7 +1048,6 @@ class TestMaxAutotune(TestCase):
@config.patch(
max_autotune=True,
max_autotune_gemm_backends="TRITON",
autotune_fallback_to_aten=False,
)
def test_max_autotune_decompose_k_dynamic_input(self):
def f(a, b):
@@ -1106,7 +1095,6 @@ class TestMaxAutotune(TestCase):
@config.patch(
max_autotune=True,
max_autotune_gemm_backends="TRITON",
autotune_fallback_to_aten=False,
)
def test_max_autotune_decompose_k_output_stride(self):
def f(a, b):

View File

@@ -436,11 +436,8 @@ max_autotune_gemm_search_space: Literal["DEFAULT", "EXHAUSTIVE"] = os.environ.ge
"TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE", "DEFAULT"
).upper() # type: ignore[assignment]
# NOTE: This feature is deprecated and will be defauled to False in the future.
# Whether we fall back to ATen or hard error when no matches are found during autotuning
autotune_fallback_to_aten = (
os.environ.get("TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN", "0") == "1"
)
# DEPRECATED. This setting is ignored.
autotune_fallback_to_aten = False
# the value used as a fallback for the unbacked SymInts
# that can appear in the input shapes (e.g., in autotuning)
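A small illustrative check of what the stub above now means (assuming a build containing this change): the attribute is kept so existing config.patch calls do not break, but it is pinned to False and the old environment variable is no longer read.

```python
# Hypothetical check of the deprecated stub: the env var is no longer consulted,
# so the attribute stays False regardless of what the variable is set to.
import os

os.environ["TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN"] = "1"  # ignored after this PR

from torch._inductor import config as inductor_config

print(inductor_config.autotune_fallback_to_aten)  # prints: False
```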

View File

@@ -27,7 +27,6 @@ from .mm_common import (
mm_args,
mm_config_kwargs,
mm_options,
should_fallback_to_aten,
)
@@ -233,9 +232,6 @@ def tuned_bmm(mat1, mat2, out_dtype=None, *, layout=None):
if use_ck_gemm_template(layout, m, n, k):
CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2])
if should_fallback_to_aten(choices):
choices.append(aten_bmm.bind((mat1, mat2), layout))
return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout)

View File

@@ -60,7 +60,6 @@ from .mm_common import (
persistent_mm_options,
scale_mm_epilogue,
scaled_mm_options,
should_fallback_to_aten,
)
@@ -783,8 +782,6 @@ def tuned_mm(mat1, mat2, *, layout=None):
for k in inductor_config.external_matmul:
choices.append(lazy_register_extern_choice(k).bind((mat1, mat2), layout))
if should_fallback_to_aten(choices):
return aten_mm.bind((mat1, mat2), aten_layout).output_node()
return autotune_select_algorithm(name, choices, [mat1, mat2], layout)
@@ -834,15 +831,11 @@ def tuned_int_mm(mat1, mat2, *, layout=None):
**mm_options(config, m, n, k, layout),
)
if should_fallback_to_aten(choices):
return aten__int_mm.bind((mat1, mat2), layout).output_node()
return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)
@register_lowering(aten.addmm, type_promotion_kind=None)
def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
ordered_kwargs_for_cpp_kernel = ("beta", "alpha")
device_type = ir.get_device_type(mat1)
m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
static_shape, is_nonzero = _is_static_problem(layout)
@@ -973,30 +966,6 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
has_bias=True,
)
if should_fallback_to_aten(choices):
choices.append(
aten_addmm.bind(
(inp_expanded, mat1, mat2),
layout,
ordered_kwargs_for_cpp_kernel,
alpha=alpha,
beta=beta,
)
)
if (
inp_expanded.get_stride()[0] == 0
and inp_expanded.get_device().type == "cuda"
and inductor_config.triton.autotune_cublasLt
):
# unexpand inp to make sure fused addmm from cublasLt is used
choices.insert(
0,
aten_bias_addmm.bind(
(inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta
),
)
return autotune_select_algorithm(
"addmm", choices, [inp_expanded, mat1, mat2], layout
)
@@ -1198,9 +1167,6 @@ def tuned_scaled_mm(
if is_nonzero and use_ck_gemm_template(layout, m, n, k):
CKGemmTemplate.add_ck_gemm_choices(choices, layout, input_nodes)
if should_fallback_to_aten(choices):
return aten_choice.output_node()
return autotune_select_algorithm("scaled_mm", choices, input_nodes, layout)

View File

@@ -12,32 +12,13 @@ from torch._inductor.virtualized import V
from .. import config as inductor_config
from ..codegen.wrapper import PythonWrapperCodegen
from ..ir import _IntLike, ChoiceCaller, Layout, TensorBox
from ..utils import get_num_sms, TMA_DESCRIPTOR_SIZE, use_aten_gemm_kernels
from ..ir import _IntLike, Layout, TensorBox
from ..utils import get_num_sms, TMA_DESCRIPTOR_SIZE
log = logging.getLogger(__name__)
def should_fallback_to_aten(choices: list[ChoiceCaller]) -> bool:
if len(choices) == 0 and not use_aten_gemm_kernels():
if inductor_config.autotune_fallback_to_aten:
log.warning(
"No choices for GEMM, using ATen backend as fallback. "
"This behavior is being deprecated. Please add include Aten in max_autotune_gemm_backends."
)
return True
else:
log.warning(
"No choices for GEMM, chose not to fallback to ATen backend. "
"To temporarily change this behavior, set autotune_fallback_to_aten to True "
"via TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN=1, but this knob is being deprecated. "
"The long term fix is to include Aten in max_autotune_gemm_backends."
)
return False
return False
@SymbolicGridFn
def mm_grid(m, n, meta, *, cdiv):
"""

View File

@@ -4,7 +4,7 @@ from typing import Any
import torch
from torch._inductor.kernel.mm_common import mm_args
from . import config as inductor_config, lowering
from . import lowering
from .codegen.cpp_gemm_template import CppGemmTemplate, CppWoqInt4GemmTemplate
from .codegen.cpp_utils import create_epilogue_with_attr
from .lowering import expand, register_lowering
@@ -90,16 +90,6 @@ def register_woq_mm_ops() -> None:
epilogue_creator=_mul_epilogue, # type: ignore[arg-type]
)
if (
len(choices) == 0
and inductor_config.autotune_fallback_to_aten
and not use_aten_gemm_kernels()
):
log.warning("No choices for GEMM, using ATen backend as fallback")
return aten__weight_int8pack_mm.bind(
(mat1, mat2, scale), aten_layout
).output_node()
return autotune_select_algorithm(
"_weight_int8pack_mm", choices, [mat1, mat2, scale], aten_layout
)
@@ -153,16 +143,6 @@ def register_woq_mm_ops() -> None:
[mat1, mat2, group_size, qScaleAndZeros],
)
if (
len(choices) == 0
and inductor_config.autotune_fallback_to_aten
and not use_aten_gemm_kernels()
):
log.warning("No choices for GEMM, using ATen backend as fallback")
return aten__weight_int4pack_mm_cpu.bind(
(mat1, mat2, group_size, qScaleAndZeros), aten_layout
).output_node()
# define functions to generate example inputs for weight and group size
# otherwise, autotuner generates example inputs of all zeros for them
def get_example_weight(x: torch._inductor.ir.IRNode) -> torch.Tensor: