[inductor] Fix mm decomposition evaluating symints (#158998)

Fixes #154111

Resolves an issue during compilation with dynamic shapes where `torch._inductor.decomposition.mm` evaluates the input tensor's SymInt size expression because the decomposition iterates over `self.size(0)` in a Python for loop, so the output tensor is no longer dynamically shaped. The issue is limited to small (Mx1)x(1xN) matrix multiplications, and it surfaces as an explicit error with tensor subclasses such as DTensor.

The proposed fix replaces the loop with a single broadcasted product. Benchmarks are currently running at https://hud.pytorch.org/benchmark/compilers.
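
As a minimal sketch of the reasoning (illustrative code, not part of the PR; `mm_loop` and `mm_broadcast` are hypothetical names), both decompositions below produce the same (M, N) result for (M, 1) x (1, N) inputs, but only the loop-free version avoids forcing `size(0)` to a concrete integer under dynamic shapes:

```python
import torch


def mm_loop(a, b):
    # Old decomposition: range(a.size(0)) needs a concrete Python int, so a
    # symbolic size (SymInt) gets evaluated/specialized here.
    return torch.cat([a[i, :] * b for i in range(a.size(0))])


def mm_broadcast(a, b):
    # New decomposition: broadcasting (M, 1) * (1, N) -> (M, N) never
    # converts a SymInt to a concrete int.
    return a * b


a, b = torch.randn(5, 1), torch.randn(1, 6)
torch.testing.assert_close(mm_loop(a, b), a @ b)
torch.testing.assert_close(mm_broadcast(a, b), a @ b)
```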

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158998
Approved by: https://github.com/jansel, https://github.com/BoyuanFeng
Author: Arsh Zahed, 2025-07-30 16:34:12 +00:00, committed by PyTorch MergeBot
parent 90fd06be71
commit 24d07b3a67
3 changed files with 108 additions and 23 deletions


@@ -1,32 +1,32 @@
-add_loop_eager,compile_time_instruction_count,3070000000,0.10
+add_loop_eager,compile_time_instruction_count,3070000000,0.1
-add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.10
+add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1
-add_loop_inductor,compile_time_instruction_count,30280000000,0.10
+add_loop_inductor,compile_time_instruction_count,30280000000,0.1
-add_loop_inductor_dynamic_gpu,compile_time_instruction_count,39910000000,0.10
+add_loop_inductor_dynamic_gpu,compile_time_instruction_count,39910000000,0.1
-add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.10
+add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.1
-basic_modules_ListOfLinears_eager,compile_time_instruction_count,969100000,0.10
+basic_modules_ListOfLinears_eager,compile_time_instruction_count,969100000,0.1
-basic_modules_ListOfLinears_inductor,compile_time_instruction_count,18030000000,0.10
+basic_modules_ListOfLinears_inductor,compile_time_instruction_count,15240000000,0.1
-basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.10
+basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.1
@@ -34,56 +34,56 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,11090000
-update_hint_regression,compile_time_instruction_count,1719000000,0.10
+update_hint_regression,compile_time_instruction_count,1719000000,0.1
-sum_floordiv_regression,compile_time_instruction_count,966100000,0.10
+sum_floordiv_regression,compile_time_instruction_count,966100000,0.1
-symint_sum,compile_time_instruction_count,3237000000,0.10
+symint_sum,compile_time_instruction_count,3237000000,0.1
-symint_sum_loop,compile_time_instruction_count,4299000000,0.10
+symint_sum_loop,compile_time_instruction_count,4299000000,0.1
-aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.10
+aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1
-aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.10
+aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1
-aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.10
+aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1
-aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.10
+aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1
-aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.10
+aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1
-aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.10
+aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1
-mm_loop_inductor_gpu,compile_time_instruction_count,4461000000,0.10
+mm_loop_inductor_gpu,compile_time_instruction_count,4461000000,0.1
-mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8417000000,0.10
+mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8417000000,0.1
-basic_NestedModule_eager,compile_time_instruction_count,8348000000,0.10
+basic_NestedModule_eager,compile_time_instruction_count,8348000000,0.1
-basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.10
+basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1


@@ -6,6 +6,13 @@ from typing import Union
 import torch
 from torch._inductor import config
 from torch._inductor.decomposition import mm
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.fx.experimental.symbolic_shapes import (
+    DimDynamic,
+    ShapeEnv,
+    StatelessSymbolicContext,
+)
 from torch.testing._internal.common_cuda import SM80OrLater
 from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_nn import NNTestCase
@@ -78,6 +85,19 @@ def torch_baddbmm(add, b, c, alpha, beta):
     return torch.baddbmm(add, b, c, alpha=alpha, beta=beta)
 
 
+def create_fake_tensor_with_dynamic_size(x, fake_mode):
+    with fake_mode:
+        dynamic_sizes = [DimDynamic.DYNAMIC for _ in range(x.dim())]
+        dynamic_strides = [DimDynamic.INFER_STRIDE for _ in range(x.dim())]
+        return fake_mode.from_tensor(
+            x,
+            symbolic_context=StatelessSymbolicContext(
+                dynamic_sizes=dynamic_sizes,
+                dynamic_strides=dynamic_strides,
+            ),
+        )
+
+
 # The shapes we test on
 ts_list = [
     (1, 32, 32, 1),
@@ -187,6 +207,71 @@ class TestDecomp(NNTestCase):
             init_tensor([[[1], [2], [3], [4]]] * bs, dtype=dtype, device=device),
         )
 
+    @parametrize("dtype", [torch.float, torch.bfloat16])
+    def test_dynamic_shape_mm(self, device, dtype):
+        # Test that the mm decomp does not evaluate expressions for dynamic shapes
+        shape_env = ShapeEnv()
+        fake_mode = FakeTensorMode(shape_env=shape_env)
+        # Only test the decomp on cpu, to match the fake tensors coming from dynamo
+        if device != "cpu":
+            return
+        for t_size in ts_list:
+            a1_0, a1_1, a2_0, a2_1 = t_size
+            # Create the fake tensors
+            t1 = create_fake_tensor_with_dynamic_size(
+                rand_math_tensor((a1_0, a1_1), dtype=dtype, device=device),
+                fake_mode,
+            )
+            t2 = create_fake_tensor_with_dynamic_size(
+                rand_math_tensor((a2_0, a2_1), dtype=dtype, device=device),
+                fake_mode,
+            )
+            # Save the expression types so we can check whether any symints get evaluated
+            og_t1_expr_types = [
+                type(d.node.expr) if type(d) is torch.SymInt else int for d in t1.size()
+            ]
+            og_t2_expr_types = [
+                type(d.node.expr) if type(d) is torch.SymInt else int for d in t2.size()
+            ]
+            r = mm(t1, t2)
+            # Make sure no symints were evaluated
+            new_t1_expr_types = [
+                type(d.node.expr) if type(d) is torch.SymInt else int for d in t1.size()
+            ]
+            new_t2_expr_types = [
+                type(d.node.expr) if type(d) is torch.SymInt else int for d in t2.size()
+            ]
+            self.assertTrue(
+                all(
+                    og_t1_expr_types[i] == new_t1_expr_types[i]
+                    for i in range(len(og_t1_expr_types))
+                )
+            )
+            self.assertTrue(
+                all(
+                    og_t2_expr_types[i] == new_t2_expr_types[i]
+                    for i in range(len(og_t2_expr_types))
+                )
+            )
+            if r is not NotImplemented:
+                # Check that the output is well formed
+                self.assertEqual(t1.size(0), r.size(0))
+                self.assertEqual(t2.size(1), r.size(1))
+                r_expr_types = [
+                    type(d.node.expr) if type(d) is torch.SymInt else int
+                    for d in r.size()
+                ]
+                self.assertTrue(r_expr_types[0] == og_t1_expr_types[0])
+                self.assertTrue(r_expr_types[1] == og_t2_expr_types[1])
+
 
 device_types = ("cpu", GPU_TYPE)
 instantiate_device_type_tests(TestDecomp, globals(), only_for=device_types)
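
For context, here is a self-contained usage sketch of the `create_fake_tensor_with_dynamic_size` helper added above, using only APIs that appear in the diff. By default a `ShapeEnv` specializes sizes 0 and 1 to constants, so the sketch avoids those:

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.fx.experimental.symbolic_shapes import (
    DimDynamic,
    ShapeEnv,
    StatelessSymbolicContext,
)

fake_mode = FakeTensorMode(shape_env=ShapeEnv())
x = torch.randn(4, 3)
with fake_mode:
    fake_x = fake_mode.from_tensor(
        x,
        symbolic_context=StatelessSymbolicContext(
            dynamic_sizes=[DimDynamic.DYNAMIC] * x.dim(),
            dynamic_strides=[DimDynamic.INFER_STRIDE] * x.dim(),
        ),
    )
# Every dimension is now a symbolic SymInt instead of a concrete int.
print([type(d) for d in fake_x.size()])  # -> [SymInt, SymInt]
```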

torch/_inductor/decomposition.py

@@ -367,7 +367,7 @@ def mm(
         and guard_or_false((torch.numel(self) + torch.numel(input2)) <= 32)
     ):
         counters["inductor"]["decompose_mm"] += 1
-        return torch.cat([self[i, :] * input2 for i in range(self.size(0))])
+        return self * input2
     if statically_known_true(self.size(0) == 1) and statically_known_true(
         input2.size(-1) == 1
     ):
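
Finally, a hedged repro sketch for the shape class the fix targets (adapted from the description of issue #154111, not taken from the PR): a small (M, 1) x (1, N) matmul compiled with dynamic shapes, the case where the loop-based decomposition previously evaluated the symbolic batch dimension:

```python
import torch


def f(a, b):
    return a @ b


compiled = torch.compile(f, dynamic=True)
out = compiled(torch.randn(4, 1), torch.randn(1, 4))
print(out.shape)  # torch.Size([4, 4])
```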