[ROCm][CI] remove relaxed tolerance for tf32 tests (#166478)

Instead of relaxing tolerances for certain unit tests that exercise TF32 on MI300, skip the tests until hipblaslt accuracy is improved.
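As a minimal sketch of the pattern applied throughout this change (the test name below is hypothetical; skipIfRocmArch and MI300_ARCH come from torch.testing._internal.common_utils, as the import hunks below show):

# Before: the TF32 tolerance was widened only when running under ROCm.
@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
def test_example(self, device):
    ...

# After: keep the strict tolerance and skip the test entirely on MI300 (gfx94x)
# until hipblaslt accuracy is improved.
@skipIfRocmArch(MI300_ARCH)
@tf32_on_and_off(0.005)
def test_example(self, device):
    ...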

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166478
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>
Jeff Daily 2025-10-31 16:15:39 +00:00 committed by PyTorch MergeBot
parent 1e3600b528
commit c3b71d5499
7 changed files with 36 additions and 17 deletions

View File

@@ -47,9 +47,11 @@ from torch.testing._internal.common_utils import (
gradgradcheck,
instantiate_parametrized_tests,
MACOS_VERSION,
+MI300_ARCH,
parametrize as parametrize_test,
run_tests,
set_default_dtype,
+skipIfRocmArch,
subtest,
TEST_SCIPY,
TEST_WITH_ROCM,
@@ -3393,8 +3395,9 @@ class TestConvolutionNNDeviceType(NNTestCase):
F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device))
F.conv2d(x, torch.randn(1, 16, 1, 1, device=device))
+@skipIfRocmArch(MI300_ARCH)
@onlyCUDA
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
def test_Conv2d_size_1_kernel(self, device):
x_cpu = torch.randn(2, 3, 5, 5)
conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1)
@@ -3425,8 +3428,9 @@ class TestConvolutionNNDeviceType(NNTestCase):
exact_device=False,
)
+@skipIfRocmArch(MI300_ARCH)
@onlyCUDA
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
def test_ConvTranspose2d_size_1_kernel(self, device):
x_cpu = torch.randn(2, 3, 5, 5)
conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1)

View File

@@ -6967,7 +6967,8 @@ class TestCompileKernel(TestCase):
with self.assertRaises(RuntimeError):
kernel.set_shared_memory_config(excessive_shared_mem)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@unittest.skipIf(not TEST_CUDA, "No CUDA")
def test_compile_kernel_advanced(self):
# Test matrix multiplication

View File

@@ -755,10 +755,11 @@ class TestLinalg(TestCase):
cholesky_test_helper(3, batchsize, upper)
@precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4})
+@skipIfRocmArch(MI300_ARCH)
@skipCUDAIfNoMagma
@skipCPUIfNoLapack
@dtypes(*floating_and_complex_types())
-@tf32_on_and_off(0.1 if TEST_WITH_ROCM else 0.01)
+@tf32_on_and_off(0.01)
@reduced_f32_on_and_off(0.01)
def test_old_cholesky(self, device, dtype):
from torch.testing._internal.common_utils import random_hermitian_pd_matrix
@@ -7410,9 +7411,10 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
def test_addmm_gelu(self, device, dtype):
self._test_addmm_impl(torch._addmm_activation, "gelu", device, dtype)
+@skipIfRocmArch(MI300_ARCH)
@dtypes(torch.float, torch.double)
@dtypesIfCUDA(*floating_and_complex_types())
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.005)
def test_addmm_sizes(self, device, dtype):
for m in [0, 1, 25]:
@@ -9369,8 +9371,8 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
r1 = fntorch(t0_full, t1, t2)
self.assertEqual(r0, r1)
-# ROCm 6.4 passes with tf32=on, but 6.4.1 needed tolerance reduced slightly
-@tf32_on_and_off(0.002 if torch.version.hip else 0.001)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.001)
@reduced_f32_on_and_off(0.001)
def test_broadcast_batched_matmul(self, device):
n_dim = random.randint(1, 8)
@@ -9707,7 +9709,8 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
self.assertEqual((torch.tensor(1., device=device), torch.tensor(0., device=device)),
fn(torch.slogdet, (0, 0)))
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.07, 0.005)
def test_tensordot(self, device):
a = torch.arange(60., device=device).reshape(3, 4, 5)

View File

@@ -32,7 +32,7 @@ from torch.nn import Buffer, Parameter
from torch.nn.parallel._functions import Broadcast
from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes, floating_types
from torch.testing._internal.common_utils import dtype_name, freeze_rng_state, run_tests, TestCase, \
-skipIfNoLapack, skipIfRocm, \
+skipIfNoLapack, skipIfRocm, MI300_ARCH, skipIfRocmArch, \
TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
download_file, get_function_arglist, load_tests, skipIfMPS, \
IS_PPC, \
@@ -8378,8 +8378,9 @@ class TestNNDeviceType(NNTestCase):
@unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'),
"Scipy v1.0 and/or numpy not found")
+@skipIfRocmArch(MI300_ARCH)
@expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098
-@tf32_on_and_off(0.01 if TEST_WITH_ROCM else 0.001)
+@tf32_on_and_off(0.001)
@reduced_f32_on_and_off(0.001)
def test_affine_2d_rotate90(self, device):
# scipy before 1.0.0 do not support homogeneous coordinate
@@ -8526,8 +8527,9 @@ class TestNNDeviceType(NNTestCase):
@unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'),
"Scipy v1.0 and/or numpy not found")
+@skipIfRocmArch(MI300_ARCH)
@expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.005)
def test_affine_2d_rotateRandom(self, device):
# scipy before 1.0.0 do not support homogeneous coordinate
@@ -8579,7 +8581,8 @@ class TestNNDeviceType(NNTestCase):
@unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'),
"Scipy v1.0 and/or numpy not found")
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.005)
def test_affine_3d_rotateRandom(self, device):
# scipy before 1.0.0 do not support homogeneous coordinate
@@ -9456,8 +9459,9 @@ class TestNNDeviceType(NNTestCase):
unfold(inp)
@onlyCUDA
+@skipIfRocmArch(MI300_ARCH)
@dtypes(torch.float, torch.double)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
def test_rnn_fused(self, device, dtype):
def copy_rnn(rnn1, rnn2):
@@ -11936,10 +11940,11 @@ class TestNNDeviceType(NNTestCase):
with self.assertRaisesRegex(RuntimeError, "log_probs tensor must not be empty"):
F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction='none')
+@skipIfRocmArch(MI300_ARCH)
@expectedFailureMPS # RuntimeError: LSTM with projections is not currently supported with MPS.
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
@skipIfTorchDynamo("TorchDynamo fails here for unknown reasons")
def test_variable_sequence(self, device, dtype):
def pad(var, length):

View File

@@ -2479,7 +2479,8 @@ class TestTorchDeviceType(TestCase):
self.assertEqual(x1.grad, x2.grad, rtol=0, atol=0.001)
self.assertEqual(y1.grad, y2.grad, rtol=0, atol=0.001)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.08)
def test_cdist_large(self, device):
for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:

View File

@@ -24,6 +24,8 @@ from torch.testing._internal.common_nn import NNTestCase
from torch.testing._internal.common_utils import (
TEST_WITH_ROCM,
skipIfRocm,
+skipIfRocmArch,
+MI300_ARCH,
skipIfTorchDynamo,
TEST_FAIRSEQ,
run_tests,
@@ -427,7 +429,8 @@ class TestTransformers(NNTestCase):
# remove hook
handle.remove()
-@tf32_on_and_off(0.0021 if TEST_WITH_ROCM else 0.001)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.001)
@parametrize("use_torchscript", [False])
@parametrize("enable_nested_tensor", [True, False])
@parametrize("use_autocast", [True, False])

View File

@@ -120,7 +120,9 @@ module_tests = [
desc='no_bias',
reference_fn=lambda i, p, _: torch.mm(i, p[0].t()),
with_tf32=True,
-tf32_precision=0.05 if TEST_WITH_ROCM else 0.005,
+tf32_precision=0.005,
+# ROCM: skipping tf32 test on gfx94 archs due to tolerance issue.
+test_cuda=not (TEST_WITH_ROCM and "gfx94" in torch.cuda.get_device_properties(0).gcnArchName),
default_dtype=torch.double,
),
dict(