From cf7447ae992ed61cfd23df24fc7e7735aedeb987 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 28 Mar 2025 17:07:52 +0000 Subject: [PATCH] Revert "cpp_wrapper: Fix even more tests (#147225)" This reverts commit d25acac357ff8663a7787e57e6bc5e69987a8f9a. Reverted https://github.com/pytorch/pytorch/pull/147225 on behalf of https://github.com/yangw-dev due to broke test internally test/inductor/test_benchmark_fusion ([comment](https://github.com/pytorch/pytorch/pull/147225#issuecomment-2761944564)) --- test/inductor/test_benchmark_fusion.py | 57 +++++++++-------------- test/inductor/test_compiled_autograd.py | 17 ++----- test/inductor/test_max_autotune.py | 45 +++++++++--------- test/inductor/test_torchinductor.py | 3 ++ torch/testing/_internal/inductor_utils.py | 6 --- 5 files changed, 50 insertions(+), 78 deletions(-) diff --git a/test/inductor/test_benchmark_fusion.py b/test/inductor/test_benchmark_fusion.py index ca25a92758b..2192e58f0f3 100644 --- a/test/inductor/test_benchmark_fusion.py +++ b/test/inductor/test_benchmark_fusion.py @@ -10,12 +10,7 @@ from torch._inductor.test_operators import realize from torch._inductor.utils import fresh_inductor_cache, is_big_gpu, run_and_get_code from torch.testing import FileCheck from torch.testing._internal.common_utils import slowTest -from torch.testing._internal.inductor_utils import ( - get_func_call, - get_kernel_launch, - HAS_CPU, - HAS_CUDA, -) +from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA # Make the helper files in test/ importable @@ -29,7 +24,6 @@ from inductor.test_torchinductor import ( # @manual=fbcode//caffe2/test/inducto check_model, check_model_cuda, copy_tests, - skip_if_cpp_wrapper, ) from torch._inductor import config from torch._inductor.scheduler import Scheduler @@ -132,7 +126,7 @@ class BenchmarkFusionTestTemplate: self.common(f, (a, b)) - @config.patch(max_autotune_gemm_backends="TRITON") + @torch._inductor.config.patch(max_autotune_gemm_backends="TRITON") def test_avoid_register_spilling(self): if self.device != "cuda": raise unittest.SkipTest("CUDA only") @@ -163,8 +157,8 @@ class BenchmarkFusionTestTemplate: return # should be multiple triton invocations - FileCheck().check(get_func_call()).check_count( - get_kernel_launch(), 2, exactly=True + FileCheck().check("async_compile.wait").check_count( + ".run", 2, exactly=True ).run(out_code[0]) with config.patch( @@ -177,17 +171,9 @@ class BenchmarkFusionTestTemplate: _, out_code2 = run_and_get_code(foo_c, m, inp) for c in out_code[0], out_code2[0]: - FileCheck().check(get_func_call()).check( - "device_guard" if config.cpp_wrapper else "DeviceGuard" - ).check_count("empty_strided", 1, exactly=True).check_regex( - r"output_handles\[[0-9]+\] = buf[0-9]+\.release\(\)" - if config.cpp_wrapper - else r"buf[0-9]+ = buf[0-9]+; del buf[0-9]+" - ).check( - "" if config.cpp_wrapper else "return" - ).run( - c - ) + FileCheck().check("async_compile.wait").check("DeviceGuard").check_count( + "empty_strided_cuda", 1, exactly=True + ).check_regex("buf[0-9]* = buf[0-9]*; del buf[0-9]*").check("return").run(c) def test_tield_kernel_fusion(self): def f(x): @@ -210,7 +196,6 @@ if HAS_CUDA: @unittest.skipIf( torch.cuda.device_count() < 2, "The test need at least 2 devices" ) - @skip_if_cpp_wrapper("This tests triton scheduling directly") def test_benchmark_on_non_zero_device(self): hit_count = 0 with torch.cuda.device("cuda:0"): @@ -280,7 +265,9 @@ if HAS_CUDA: res, code = run_and_get_code(foo_c, m, inp) torch._dynamo.reset() - with 
config.patch(benchmark_epilogue_fusion=False): + with unittest.mock.patch.object( + torch._inductor.config, "benchmark_epilogue_fusion", False + ): foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo) with torch.no_grad(): res2, code2 = run_and_get_code(foo_c, m, inp) @@ -289,34 +276,32 @@ if HAS_CUDA: return code, code2 @fresh_inductor_cache() - @config.patch(max_autotune_gemm_backends="TRITON") + @torch._inductor.config.patch(max_autotune_gemm_backends="TRITON") def test_equivalent_template_code(self): code, code2 = self._equivalent_output_code_impl(256) for out_code in [code, code2]: - FileCheck().check(get_func_call()).check_count( - "empty_strided", 1, exactly=True - ).check("triton_tem_fused_addmm_relu_0").check_count( - ".reset()" if config.cpp_wrapper else "del", 3, exactly=True + FileCheck().check("def call").check_count( + "empty_strided_cuda", 1, exactly=True + ).check("triton_tem_fused_addmm_relu_0.run").check_count( + "del", 3, exactly=True ).check( - "" if config.cpp_wrapper else "return" + "return" ).run( out_code[0] ) @fresh_inductor_cache() - @config.patch(max_autotune_gemm_backends="ATEN") + @torch._inductor.config.patch(max_autotune_gemm_backends="ATEN") def test_equivalent_extern_code(self): torch._dynamo.reset() code, code2 = self._equivalent_output_code_impl(512, 1, False) for out_code in [code, code2]: - FileCheck().check(get_func_call()).check_count( - "empty_strided", 1, exactly=True - ).check("" if config.cpp_wrapper else "extern_kernels.").check_count( - ".reset()" if config.cpp_wrapper else "del", 3, exactly=True - ).check( - "" if config.cpp_wrapper else "return" + FileCheck().check("def call").check_count( + "empty_strided_cuda", 1, exactly=True + ).check("extern_kernels.").check_count("del", 3, exactly=True).check( + "return" ).run( out_code[0] ) diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index 78823c13a26..7294417ad08 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -2801,12 +2801,7 @@ main() loss.backward() torch._inductor.config.triton.cudagraphs = False - if inductor_config.cpp_wrapper: - self.assertIn("skipping cudagraphs", stderr_msgs.getvalue()) - self.assertEqual(counters["inductor"]["cudagraph_skips"], 1) - else: - self.assertNotIn("skipping cudagraphs", stderr_msgs.getvalue()) - self.assertEqual(counters["inductor"]["cudagraph_skips"], 0) + self.assertFalse("skipping cudagraphs" in stderr_msgs.getvalue()) def test_cudagraphs_cpu_graph(self): from torch._dynamo.testing import reduce_to_scalar_loss @@ -2839,10 +2834,7 @@ main() opt_bwd() self.assertEqual(counters["compiled_autograd"]["captures"], 1) - self.assertEqual( - counters["inductor"]["cudagraph_skips"], - 2 if inductor_config.cpp_wrapper else 0, - ) + self.assertEqual(counters["inductor"]["cudagraph_skips"], 0) @unittest.skipIf(not HAS_CUDA, "requires cuda") def test_cudagraphs_cpu_scalar_used_in_python_custom_op(self): @@ -2935,10 +2927,7 @@ TORCH_LIBRARY(test_cudagraphs_cpu_scalar_used_in_cpp_custom_op, m) { # into it. We must skip since we do not know if the cpu scalar will be used only in ATen/prim ops. 
# In the future, we can consider having a cpu scalar movement pass sometime after we trace # into the custom C++ autograd::Function (like in AOTDispatcher) - self.assertEqual( - counters["inductor"]["cudagraph_skips"], - 2 if inductor_config.cpp_wrapper else 1, - ) + self.assertEqual(counters["inductor"]["cudagraph_skips"], 1) def test_logs(self): logs, ctx = logs_to_string( diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index 7ab11c8be58..a62711196c8 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -46,14 +46,7 @@ from torch._inductor.virtualized import V from torch.fx.experimental.proxy_tensor import make_fx from torch.testing import FileCheck from torch.testing._internal.common_utils import skipIfRocm, skipIfXpu -from torch.testing._internal.inductor_utils import ( - get_func_call, - get_kernel_launch, - GPU_TYPE, - HAS_CPU, - HAS_CUDA, - HAS_GPU, -) +from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_CUDA, HAS_GPU torch.set_float32_matmul_precision("high") @@ -61,6 +54,14 @@ if HAS_CUDA: torch.cuda.memory._set_allocator_settings("expandable_segments:False") +def _get_func_call() -> str: + return "void inductor_entry_impl(" if config.cpp_wrapper else "def call(" + + +def _get_kernel_launch() -> str: + return "call_triton_" if config.cpp_wrapper else ".run(" + + def benchmark_choice(choice, args, out, expected_out, timings): result = choice.benchmark(*args, out=out) if expected_out is not None: @@ -898,8 +899,8 @@ class TestMaxAutotune(TestCase): # mm kernel, and cos kernel count = 2 if using_triton_mm else 1 - FileCheck().check(get_func_call()).check_count( - get_kernel_launch(), count, exactly=True + FileCheck().check(_get_func_call()).check_count( + _get_kernel_launch(), count, exactly=True ).run(code[0]) def f(x, y): @@ -911,8 +912,8 @@ class TestMaxAutotune(TestCase): f_c = torch.compile(mode="max-autotune-no-cudagraphs")(f) _, code = run_and_get_code(f_c, inps[0], inps[1]) self.assertEqual(f_c(*inps), f(*inps), atol=0.03, rtol=0.25) - FileCheck().check(get_func_call()).check_count( - get_kernel_launch(), 2, exactly=True + FileCheck().check(_get_func_call()).check_count( + _get_kernel_launch(), 2, exactly=True ).run(code[0]) def f(x, y): @@ -1361,21 +1362,21 @@ class TestPrologueFusion(TestCase): ) def check_code(self, code_str, num_kernels, num_allocs, num_deallocs): - FileCheck().check(get_func_call()).check_count( - get_kernel_launch(), + FileCheck().check(_get_func_call()).check_count( + _get_kernel_launch(), num_kernels, exactly=True, ).run(code_str) if num_allocs is not None: - FileCheck().check(get_func_call()).check_count( + FileCheck().check(_get_func_call()).check_count( "empty_strided", num_allocs, exactly=True ).run(code_str) # skip the deallocation check when using cpp_wrapper; most deallocations happen # outside of our control via RAIIAtenTensorHandle if num_deallocs is not None and not config.cpp_wrapper: - FileCheck().check(get_func_call()).check_count( + FileCheck().check(_get_func_call()).check_count( "del", num_deallocs, exactly=True ).run(code_str) @@ -1515,8 +1516,8 @@ class TestPrologueFusion(TestCase): out, code = run_and_get_code(torch.compile(multi_use), x, y) - FileCheck().check(get_func_call()).check_count( - get_kernel_launch(), 2, exactly=True + FileCheck().check(_get_func_call()).check_count( + _get_kernel_launch(), 2, exactly=True ).run(code[0]) self.assertEqual(out, multi_use(x, y), atol=0.05, rtol=0.05) @@ -1525,8 +1526,8 @@ class 
TestPrologueFusion(TestCase): x = torch.rand([128, 128], device=GPU_TYPE) out, code = run_and_get_code(torch.compile(resolve_pending), x) - FileCheck().check(get_func_call()).check_count( - get_kernel_launch(), 1, exactly=True + FileCheck().check(_get_func_call()).check_count( + _get_kernel_launch(), 1, exactly=True ).run(code[0]) self.assertEqual(out, resolve_pending(x), atol=0.05, rtol=0.05) @@ -1549,8 +1550,8 @@ class TestPrologueFusion(TestCase): x = torch.rand([128, 128], dtype=torch.float16, device=GPU_TYPE) out, code = run_and_get_code(torch.compile(test_multiple_fusions), x) - FileCheck().check(get_func_call()).check_count( - get_kernel_launch(), 1, exactly=True + FileCheck().check(_get_func_call()).check_count( + _get_kernel_launch(), 1, exactly=True ).run(code[0]) self.assertEqual(out, test_multiple_fusions(x), atol=0.05, rtol=0.05) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 488649241d3..ad3986cd937 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -10100,6 +10100,9 @@ class CommonTemplate: for x in (torch.randn(2, 3), torch.randn(2, 2), torch.randn(3, 2)): self.common(fn, (x,)) + @skip_if_cpp_wrapper( + "cannot currently handle fallback ops with return types containing list[Tensor]" + ) def test_kwargs(self): if self.device == GPU_TYPE: raise unittest.SkipTest("histogramdd only supports cpu") diff --git a/torch/testing/_internal/inductor_utils.py b/torch/testing/_internal/inductor_utils.py index 4461a62bbe5..1501a3bfcb3 100644 --- a/torch/testing/_internal/inductor_utils.py +++ b/torch/testing/_internal/inductor_utils.py @@ -210,12 +210,6 @@ def maybe_skip_size_asserts(op): else: return contextlib.nullcontext() -def get_func_call() -> str: - return "void inductor_entry_impl(" if torch._inductor.config.cpp_wrapper else "def call(" - -def get_kernel_launch() -> str: - return "call_triton_" if torch._inductor.config.cpp_wrapper else ".run(" - def clone_preserve_strides_offset(x, device=None): if not isinstance(x, torch.Tensor): return x
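
Note for reviewers: the checks being reverted all hinge on a pair of cpp_wrapper-aware FileCheck helpers, get_func_call() and get_kernel_launch(), which this revert removes from torch/testing/_internal/inductor_utils.py (test_max_autotune.py keeps private _get_func_call/_get_kernel_launch copies). The following is a minimal, self-contained sketch of that pattern, lifted from the helper bodies visible in the patch; assert_kernel_count is a hypothetical wrapper name used here only for illustration, not an API in the tree.

    # Sketch only: mirrors the helpers removed by this revert.
    import torch._inductor.config as config
    from torch.testing import FileCheck


    def get_func_call() -> str:
        # cpp_wrapper emits a C++ entry point; the default Python
        # wrapper codegen emits "def call(".
        return "void inductor_entry_impl(" if config.cpp_wrapper else "def call("


    def get_kernel_launch() -> str:
        # Under cpp_wrapper, Triton kernels are launched through generated
        # call_triton_* wrappers; the Python wrapper uses <kernel>.run(...).
        return "call_triton_" if config.cpp_wrapper else ".run("


    def assert_kernel_count(generated_code: str, expected_launches: int) -> None:
        # Hypothetical helper showing the assertion shape used by the
        # affected tests: locate the wrapper entry point, then require an
        # exact number of kernel launches in the generated code.
        FileCheck().check(get_func_call()).check_count(
            get_kernel_launch(), expected_launches, exactly=True
        ).run(generated_code)

After the revert, the benchmark-fusion tests go back to hard-coding the Python-wrapper strings ("async_compile.wait", "def call", ".run", "empty_strided_cuda"), which is why they only pass with cpp_wrapper disabled.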