[Inductor] Cleanup autotune_fallback_to_aten post-deprecation (#154331)

Fixes #153298

This PR is the third and final step of #147479.
All references to autotune_fallback_to_aten have been removed; the config entry survives only as a deprecated, ignored stub that is hard-coded to False.
All calls to should_fallback_to_aten() have also been removed, since the helper is no longer needed.
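For users who previously relied on the implicit ATen fallback, the replacement is to list ATEN explicitly in max_autotune_gemm_backends. A minimal sketch, assuming a CUDA device; the backend list and shapes are illustrative, not taken from this PR:

```python
# Minimal sketch, assuming a CUDA device is available. With the automatic
# fallback gone, ATen GEMM kernels are only considered when "ATEN" is listed
# explicitly in the backend string.
import torch
from torch._inductor import config as inductor_config

inductor_config.max_autotune = True
inductor_config.max_autotune_gemm_backends = "ATEN,TRITON"  # illustrative list

a = torch.randn(128, 64, device="cuda")
b = torch.randn(64, 32, device="cuda")

compiled_mm = torch.compile(torch.mm)
out = compiled_mm(a, b)  # autotunes over the ATen and Triton GEMM choices
```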

[henrylhtsang](https://github.com/henrylhtsang)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154331
Approved by: https://github.com/henrylhtsang
Authored by Joaquin on 2025-05-29 20:29:54 +00:00; committed by PyTorch MergeBot
parent 629fca295e
commit cb56df55dc
8 changed files with 5 additions and 122 deletions

View File

@@ -80,7 +80,6 @@ def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) ->
 @dataclass(frozen=True, kw_only=True)
 class ExperimentConfig:
-    autotune_fallback_to_aten: bool = False
     max_autotune: bool = True
     coordinate_descent_tuning: bool = True
     max_autotune_gemm_backends: str = "ATEN"
@@ -91,7 +90,6 @@ class ExperimentConfig:
     def to_options(self) -> dict[str, Any]:
         return {
-            "autotune_fallback_to_aten": self.autotune_fallback_to_aten,
             "max_autotune": self.max_autotune,
             "coordinate_descent_tuning": self.coordinate_descent_tuning,
             "max_autotune_gemm_backends": self.max_autotune_gemm_backends,

View File

@@ -129,7 +129,6 @@ use_evt_config = config.patch(
         "max_autotune": True,
         "max_autotune_gemm_backends": "CUTLASS",
         "cuda.cutlass_max_profiling_configs": 1,
-        "autotune_fallback_to_aten": False,
         "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
         "cuda.cutlass_tma_only": True,
         "cuda.cutlass_epilogue_fusion_enabled": True,
@@ -141,7 +140,6 @@ fp8_config = config.patch(
         "max_autotune": True,
         "max_autotune_gemm_backends": "CUTLASS",
         "cuda.cutlass_max_profiling_configs": 1,
-        "autotune_fallback_to_aten": False,
         "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
         "cuda.cutlass_tma_only": True,
     }
@@ -269,7 +267,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
                 "cuda.cutlass_max_profiling_configs": 4,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(torch.mm)(a, b)
@@ -308,7 +305,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
                 "cuda.cutlass_max_profiling_configs": 4,
-                "autotune_fallback_to_aten": False,
             }
         ):
             for x_shape in x_shapes:
@@ -336,7 +332,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "compile_threads": 4,
                 "cuda.cutlass_max_profiling_configs": 4,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(torch.bmm)(a, b)
@@ -369,7 +364,6 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
             }
         ):
             from torch._inductor.utils import run_and_get_code
@@ -412,7 +406,6 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_max_profiling_swizzle_options": [
                     1,
                     2,
@@ -482,7 +475,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ), dynamo_config.patch({"error_on_recompile": dynamic}):
             expected = [model(*input) for input in inputs]
@@ -560,7 +552,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ), dynamo_config.patch({"error_on_recompile": dynamic}):
             expected = [model(*input) for input in inputs]
@@ -622,7 +613,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ):
             expected = [model(*input) for input in inputs]
@@ -657,7 +647,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
                 "cuda.cutlass_op_allowlist_regex": "stream_k",  # only stream-k GEMM Kernels
-                "autotune_fallback_to_aten": False,
             }
         ):
             for M, K, N in (
@@ -713,7 +702,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 4,
                 "cuda.version": "12.2",  # required to enable the Kernels we need
-                "autotune_fallback_to_aten": False,
             }
         ):
             counters["inductor"]["cuda_epilogue_fusion_counter"] = 0
@@ -812,7 +800,6 @@ class TestCutlassBackend(TestCase):
                 "autotune_in_subproc": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
@@ -831,7 +818,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_max_profiling_configs": 2,
             }
         ):
@@ -870,7 +856,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_max_profiling_configs": 2,
             }
         ):
@@ -904,7 +889,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "autotune_in_subproc": False,
                 "max_autotune_gemm_backends": "CUTLASS",
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_op_allowlist_regex": "128x256x64.*stream_k_warpspecialized_cooperative_epi_nosmem",
                 "cuda.cutlass_max_profiling_configs": 1,
             }
@@ -951,7 +935,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 2,
                 "autotune_local_cache": True,
-                "autotune_fallback_to_aten": False,
             }
         ):
             Y_compiled = torch.compile(mm, dynamic=dynamic)(a_sparse, b)
@@ -1107,7 +1090,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ), mock.patch(
             "torch._inductor.kernel.mm.autotune_select_algorithm",
@@ -1166,7 +1148,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
                 "cuda.cutlass_presets": presets,
             }
         ), mock.patch(
@@ -1272,7 +1253,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
                 "cuda.generate_test_runner": True,  # put standalone runner in the generated code
             }
         ):
@@ -1389,7 +1369,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": max_autotune_gemm_backends,
                 "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
             }
         ):
             compiled = torch.compile(torch.mm)
@@ -1413,7 +1392,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
             }
         ):
             _ = torch.compile(model)(B)
@@ -1436,7 +1414,6 @@ class TestCutlassBackend(TestCase):
                 "max_autotune": True,
                 "max_autotune_gemm_backends": "CUTLASS",
                 "cuda.cutlass_max_profiling_configs": 1,
-                "autotune_fallback_to_aten": False,
             }
         ):
             _ = torch.compile(model)(B)

View File

@@ -139,7 +139,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -164,7 +163,6 @@ class TestMaxAutotune(TestCase):
         with self.assertRaises(BackendCompilerFailed) as context, config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -198,7 +196,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -259,7 +256,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -285,7 +281,6 @@ class TestMaxAutotune(TestCase):
         with self.assertRaises(BackendCompilerFailed) as context, config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -321,7 +316,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": "1",
                 "test_configs.autotune_choice_name_regex": "mm_persistent_tma",
             }
@@ -380,7 +374,6 @@ class TestMaxAutotune(TestCase):
         with config.patch(
             {
                 "max_autotune": True,
-                "autotune_fallback_to_aten": False,
                 "triton.enable_persistent_tma_matmul": True,
                 "max_autotune_gemm_backends": "TRITON",
                 "test_configs.autotune_choice_name_regex": "tma",
@@ -868,7 +861,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="",
-        autotune_fallback_to_aten=False,
     )
     def test_no_valid_choices(self):
         a = torch.zeros([2, 2], device=GPU_TYPE)
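A hedged sketch of the behavior test_no_valid_choices above exercises: with an empty backend list and no ATen fallback, compilation is expected to fail loudly rather than silently switch to ATen. The exception type is taken from the assertRaises(BackendCompilerFailed) calls earlier in this file; shapes and device are illustrative:

```python
# Hedged sketch based on the test_no_valid_choices pattern above: an empty
# GEMM backend list now produces a compile-time error instead of an ATen
# fallback. Shapes and device are illustrative.
import torch
from torch._dynamo.exc import BackendCompilerFailed
from torch._inductor import config as inductor_config

a = torch.zeros(2, 2, device="cuda")
b = torch.zeros(2, 2, device="cuda")

with inductor_config.patch({"max_autotune": True, "max_autotune_gemm_backends": ""}):
    try:
        torch.compile(torch.mm)(a, b)
    except BackendCompilerFailed:
        print("no valid GEMM choices -> hard error instead of ATen fallback")
```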
@@ -881,7 +873,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_inf_timing(self, multi_template):
         from unittest.mock import patch
@@ -955,7 +946,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_max_autotune_decompose_k(self, sizes, dtype, dynamic):
         fp16_red_setting = (
@@ -1058,7 +1048,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_max_autotune_decompose_k_dynamic_input(self):
         def f(a, b):
@@ -1106,7 +1095,6 @@ class TestMaxAutotune(TestCase):
     @config.patch(
         max_autotune=True,
         max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
     )
     def test_max_autotune_decompose_k_output_stride(self):
         def f(a, b):

View File

@@ -436,11 +436,8 @@ max_autotune_gemm_search_space: Literal["DEFAULT", "EXHAUSTIVE"] = os.environ.ge
     "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE", "DEFAULT"
 ).upper()  # type: ignore[assignment]

-# NOTE: This feature is deprecated and will be defauled to False in the future.
-# Whether we fall back to ATen or hard error when no matches are found during autotuning
-autotune_fallback_to_aten = (
-    os.environ.get("TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN", "0") == "1"
-)
+# DEPRECATED. This setting is ignored.
+autotune_fallback_to_aten = False

 # the value used as a fallback for the unbacked SymInts
 # that can appear in the input shapes (e.g., in autotuning)
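A short sketch of what the stub above implies for downstream code, assuming only what the diff shows: the attribute still exists but is fixed at False, and TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN no longer feeds it:

```python
# Sketch under the assumptions stated above: the attribute is still readable,
# so older code does not break at import time, but its value is always False
# and patching it no longer affects GEMM autotuning.
from torch._inductor import config as inductor_config

assert inductor_config.autotune_fallback_to_aten is False

with inductor_config.patch({"autotune_fallback_to_aten": True}):
    # accepted syntactically, but ignored by the autotuner after this PR
    pass
```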

View File

@@ -27,7 +27,6 @@ from .mm_common import (
     mm_args,
     mm_config_kwargs,
     mm_options,
-    should_fallback_to_aten,
 )
@@ -233,9 +232,6 @@ def tuned_bmm(mat1, mat2, out_dtype=None, *, layout=None):
     if use_ck_gemm_template(layout, m, n, k):
         CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2])
-    if should_fallback_to_aten(choices):
-        choices.append(aten_bmm.bind((mat1, mat2), layout))
     return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout)

View File

@@ -60,7 +60,6 @@ from .mm_common import (
     persistent_mm_options,
     scale_mm_epilogue,
     scaled_mm_options,
-    should_fallback_to_aten,
 )
@@ -783,8 +782,6 @@ def tuned_mm(mat1, mat2, *, layout=None):
     for k in inductor_config.external_matmul:
         choices.append(lazy_register_extern_choice(k).bind((mat1, mat2), layout))
-    if should_fallback_to_aten(choices):
-        return aten_mm.bind((mat1, mat2), aten_layout).output_node()
     return autotune_select_algorithm(name, choices, [mat1, mat2], layout)
@@ -834,15 +831,11 @@ def tuned_int_mm(mat1, mat2, *, layout=None):
                 **mm_options(config, m, n, k, layout),
             )
-    if should_fallback_to_aten(choices):
-        return aten__int_mm.bind((mat1, mat2), layout).output_node()
     return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout)

 @register_lowering(aten.addmm, type_promotion_kind=None)
 def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
-    ordered_kwargs_for_cpp_kernel = ("beta", "alpha")
     device_type = ir.get_device_type(mat1)
     m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
     static_shape, is_nonzero = _is_static_problem(layout)
@@ -973,30 +966,6 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
                 has_bias=True,
             )
-    if should_fallback_to_aten(choices):
-        choices.append(
-            aten_addmm.bind(
-                (inp_expanded, mat1, mat2),
-                layout,
-                ordered_kwargs_for_cpp_kernel,
-                alpha=alpha,
-                beta=beta,
-            )
-        )
-        if (
-            inp_expanded.get_stride()[0] == 0
-            and inp_expanded.get_device().type == "cuda"
-            and inductor_config.triton.autotune_cublasLt
-        ):
-            # unexpand inp to make sure fused addmm from cublasLt is used
-            choices.insert(
-                0,
-                aten_bias_addmm.bind(
-                    (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta
-                ),
-            )
     return autotune_select_algorithm(
         "addmm", choices, [inp_expanded, mat1, mat2], layout
     )
@@ -1198,9 +1167,6 @@ def tuned_scaled_mm(
     if is_nonzero and use_ck_gemm_template(layout, m, n, k):
         CKGemmTemplate.add_ck_gemm_choices(choices, layout, input_nodes)
-    if should_fallback_to_aten(choices):
-        return aten_choice.output_node()
     return autotune_select_algorithm("scaled_mm", choices, input_nodes, layout)

View File

@@ -12,32 +12,13 @@ from torch._inductor.virtualized import V
 from .. import config as inductor_config
 from ..codegen.wrapper import PythonWrapperCodegen
-from ..ir import _IntLike, ChoiceCaller, Layout, TensorBox
-from ..utils import get_num_sms, TMA_DESCRIPTOR_SIZE, use_aten_gemm_kernels
+from ..ir import _IntLike, Layout, TensorBox
+from ..utils import get_num_sms, TMA_DESCRIPTOR_SIZE

 log = logging.getLogger(__name__)

-def should_fallback_to_aten(choices: list[ChoiceCaller]) -> bool:
-    if len(choices) == 0 and not use_aten_gemm_kernels():
-        if inductor_config.autotune_fallback_to_aten:
-            log.warning(
-                "No choices for GEMM, using ATen backend as fallback. "
-                "This behavior is being deprecated. Please add include Aten in max_autotune_gemm_backends."
-            )
-            return True
-        else:
-            log.warning(
-                "No choices for GEMM, chose not to fallback to ATen backend. "
-                "To temporarily change this behavior, set autotune_fallback_to_aten to True "
-                "via TORCHINDUCTOR_AUTOTUNE_FALLBACK_TO_ATEN=1, but this knob is being deprecated. "
-                "The long term fix is to include Aten in max_autotune_gemm_backends."
-            )
-            return False
-    return False

 @SymbolicGridFn
 def mm_grid(m, n, meta, *, cdiv):
     """

View File

@@ -4,7 +4,7 @@ from typing import Any
 import torch
 from torch._inductor.kernel.mm_common import mm_args

-from . import config as inductor_config, lowering
+from . import lowering
 from .codegen.cpp_gemm_template import CppGemmTemplate, CppWoqInt4GemmTemplate
 from .codegen.cpp_utils import create_epilogue_with_attr
 from .lowering import expand, register_lowering
@@ -90,16 +90,6 @@ def register_woq_mm_ops() -> None:
                 epilogue_creator=_mul_epilogue,  # type: ignore[arg-type]
             )
-        if (
-            len(choices) == 0
-            and inductor_config.autotune_fallback_to_aten
-            and not use_aten_gemm_kernels()
-        ):
-            log.warning("No choices for GEMM, using ATen backend as fallback")
-            return aten__weight_int8pack_mm.bind(
-                (mat1, mat2, scale), aten_layout
-            ).output_node()
         return autotune_select_algorithm(
             "_weight_int8pack_mm", choices, [mat1, mat2, scale], aten_layout
         )
@@ -153,16 +143,6 @@ def register_woq_mm_ops() -> None:
             [mat1, mat2, group_size, qScaleAndZeros],
         )
-        if (
-            len(choices) == 0
-            and inductor_config.autotune_fallback_to_aten
-            and not use_aten_gemm_kernels()
-        ):
-            log.warning("No choices for GEMM, using ATen backend as fallback")
-            return aten__weight_int4pack_mm_cpu.bind(
-                (mat1, mat2, group_size, qScaleAndZeros), aten_layout
-            ).output_node()
         # define functions to generate example inputs for weight and group size
         # otherwise, autotuner generates example inputs of all zeros for them
         def get_example_weight(x: torch._inductor.ir.IRNode) -> torch.Tensor: