Revert "[inductor][test] Skip triton tests for MPS as well, also change reason for skipping SM89 to not IS_BIG_GPU (#151506)"

This reverts commit 6246c7d62c.

Reverted https://github.com/pytorch/pytorch/pull/151506 on behalf of https://github.com/henrylhtsang due to seems to be breaking some rocm mi300 run ([comment](https://github.com/pytorch/pytorch/pull/151506#issuecomment-2815999009))
PyTorch MergeBot 2025-04-18 18:40:15 +00:00
parent cccfc146fe
commit e434a9152e
5 changed files with 21 additions and 34 deletions
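
For context, both skip conditions that this revert swaps are ordinary `unittest.skipIf` guards. Below is a minimal sketch of the two styles; the `_ExampleGemmTests` class and its test bodies are invented for illustration, while `IS_SM89` (compute capability 8.9) and `IS_BIG_GPU` are the existing helpers that the hunks below import and un-import:

```python
import unittest

from torch.testing._internal.common_cuda import IS_SM89
from torch.testing._internal.inductor_utils import IS_BIG_GPU


class _ExampleGemmTests(unittest.TestCase):
    # Gate restored by this revert: skip only on SM89 (compute capability 8.9),
    # where Triton is not supported as the Inductor GEMM backend.
    @unittest.skipIf(
        IS_SM89,
        "Triton not supported as Inductor GEMM backend on SM89, "
        "see https://github.com/pytorch/pytorch/issues/150390",
    )
    def test_gemm_skip_on_sm89(self):
        pass

    # Gate being reverted (introduced by #151506): skip whenever the GPU does
    # not have enough SMs to exercise the Triton-only max-autotune path.
    @unittest.skipIf(
        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
    )
    def test_gemm_skip_on_small_gpu(self):
        pass


if __name__ == "__main__":
    unittest.main()
```

The hunks below swap the second gate back to the first in the tests that had it before #151506, and drop it entirely where #151506 had newly added it.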


@@ -27,7 +27,11 @@ from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
 from torch.export import Dim, export, export_for_training
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8, SM80OrLater
+from torch.testing._internal.common_cuda import (
+    IS_SM89,
+    PLATFORM_SUPPORTS_FP8,
+    SM80OrLater,
+)
 from torch.testing._internal.common_device_type import (
     _has_sufficient_memory,
     skipCUDAIf,
@@ -49,7 +53,7 @@ from torch.testing._internal.common_utils import (
     TEST_WITH_ROCM,
 )
 from torch.testing._internal.custom_tensor import CustomTensorPlainOut
-from torch.testing._internal.inductor_utils import GPU_TYPE, IS_BIG_GPU
+from torch.testing._internal.inductor_utils import GPU_TYPE
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
 from torch.testing._internal.triton_utils import HAS_GPU, requires_gpu
 from torch.utils import _pytree as pytree
@@ -509,9 +513,6 @@ class AOTInductorTestsTemplate:
         model = LinearModel(device=self.device)
         self.check_model(model, example_inputs)
 
-    @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
-    )
     def test_linear_dynamic_maxautotune(self):
         if self.device == "cpu":
             raise unittest.SkipTest("using triton backend only is not supported on CPU")
@@ -630,9 +631,6 @@ class AOTInductorTestsTemplate:
         actual = AOTIRunnerUtil.legacy_run(self.device, model, example_inputs)
         self.assertTrue(same(model(*example_inputs), actual))
 
-    @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
-    )
     @skip("Test was marked as expected failure, but does not fail always anymore.")
     def test_dynamic_smem_above_default_limit(self):
         if self.device == "cpu":
@@ -958,7 +956,8 @@ class AOTInductorTestsTemplate:
     )
     @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
+        IS_SM89,
+        "Triton not supported as Inductor GEMM backend on SM89, see https://github.com/pytorch/pytorch/issues/150390",
     )
     def test_addmm_multiple_dynamic(self):
         if self.device == "cpu":
@@ -1001,7 +1000,8 @@ class AOTInductorTestsTemplate:
     )
     @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
+        IS_SM89,
+        "Triton not supported as Inductor GEMM backend on SM89, see https://github.com/pytorch/pytorch/issues/150390",
     )
     def test_bmm_multiple_dynamic(self):
         if self.device == "cpu":
@@ -3128,9 +3128,6 @@ class AOTInductorTestsTemplate:
         inputs = (torch.randn(4, 4, device=self.device),)
         self.check_model(Model(), inputs)
 
-    @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
-    )
     def test_convolution(self):
         if self.device == "cpu":
             raise unittest.SkipTest("using triton backend only is not supported on CPU")


@@ -10,12 +10,7 @@ from torch._inductor.test_operators import realize
 from torch._inductor.utils import fresh_inductor_cache, is_big_gpu, run_and_get_code
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import slowTest
-from torch.testing._internal.inductor_utils import (
-    get_func_call,
-    HAS_CPU,
-    HAS_CUDA,
-    IS_BIG_GPU,
-)
+from torch.testing._internal.inductor_utils import get_func_call, HAS_CPU, HAS_CUDA
 
 
 # Make the helper files in test/ importable
@@ -33,6 +28,7 @@ from inductor.test_torchinductor import (  # @manual=fbcode//caffe2/test/inducto
 )
 from torch._inductor import config
 from torch._inductor.scheduler import Scheduler
+from torch.testing._internal.common_cuda import IS_SM89
 
 
 class TestCase(InductorTestCase):
@@ -133,7 +129,8 @@ class BenchmarkFusionTestTemplate:
         self.common(f, (a, b))
 
     @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
+        IS_SM89,
+        "Triton not supported as Inductor GEMM backend on SM89, see https://github.com/pytorch/pytorch/issues/150390",
     )
     @config.patch(max_autotune_gemm_backends="TRITON")
     def test_avoid_register_spilling(self):


@@ -41,7 +41,6 @@ from torch.testing._internal.common_utils import (
     TEST_WITH_ROCM,
     xfailIfPy312Plus,
 )
-from torch.testing._internal.inductor_utils import IS_BIG_GPU
 
 
 if TEST_WITH_ROCM:
@@ -1134,9 +1133,6 @@ class CudaReproTests(TestCase):
         self.assertEqual(expect, actual)
 
-    @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
-    )
     @config.patch(
         {
             "max_autotune_gemm_backends": "TRITON",


@@ -4,7 +4,6 @@ import contextlib
 import os
 import subprocess
 import sys
-import unittest
 from unittest.mock import patch
 
 import torch
@@ -16,7 +15,7 @@ from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import fresh_inductor_cache
 from torch.testing import FileCheck
 from torch.testing._internal.common_cuda import xfailIfSM89
-from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
 
 
 class TestKernelBenchmark(TestCase):
@@ -149,9 +148,6 @@ class TestKernelBenchmark(TestCase):
     @config.patch(
         max_autotune=True, max_autotune_gemm_backends="TRITON", force_shape_pad=True
     )
-    @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
-    )
     @fresh_inductor_cache()
     def test_matmul_triton_kernel_benchmark(self):
         M = 12544
@@ -467,9 +463,6 @@ class TestKernelBenchmark(TestCase):
         compiled_module = self.get_compiled_module()
         self.verify_remove_inductor_deps(compiled_module)
 
-    @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
-    )
     @config.patch("triton.unique_kernel_names", True)
     @config.patch("triton.unique_kernel_names", True)
     @config.patch(benchmark_kernel=False)


@@ -121,6 +121,7 @@ from torch._inductor.compile_fx import (
     complex_memory_overlap,
 )
 from torch._inductor.utils import has_torchvision_roi_align
+from torch.testing._internal.common_cuda import IS_SM89
 from torch.testing._internal.common_utils import slowTest
 from torch.testing._internal.inductor_utils import (
     clone_preserve_strides_offset,
@@ -129,7 +130,6 @@ from torch.testing._internal.inductor_utils import (
     HAS_GPU,
     HAS_MPS,
     HAS_MULTIGPU,
-    IS_BIG_GPU,
     requires_gpu,
     RUN_CPU,
     RUN_GPU,
@@ -3839,7 +3839,8 @@ class CommonTemplate:
         torch.compile(fn)(t)
 
     @unittest.skipIf(
-        not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"
+        IS_SM89,
+        "Triton not supported as Inductor GEMM backend on SM89, see https://github.com/pytorch/pytorch/issues/150390",
     )
     @config.patch(
         {
@@ -3848,6 +3849,9 @@ class CommonTemplate:
         }
     )
     def test_linear_dynamic_maxautotune(self):
+        if self.device == "cpu":
+            raise unittest.SkipTest("using triton backend only is not supported on CPU")
+
         @torch.compile(dynamic=True)
         class Model(torch.nn.Module):
             def __init__(self) -> None: