[ROCm][CI] remove relaxed tolerance for tf32 tests (#166478)

Instead of relaxing tolerances for certain unit tests that exercise TF32 on MI300, skip the tests until hipblaslt accuracy is improved.
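As a minimal sketch of the pattern applied throughout this change (the test name below is hypothetical; skipIfRocmArch and MI300_ARCH come from torch.testing._internal.common_utils, as the import hunks below show):

# Before: the TF32 tolerance was widened only when running under ROCm.
@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
def test_example(self, device):
    ...

# After: keep the strict tolerance and skip the test entirely on MI300 (gfx94x)
# until hipblaslt accuracy is improved.
@skipIfRocmArch(MI300_ARCH)
@tf32_on_and_off(0.005)
def test_example(self, device):
    ...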

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166478
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>
Jeff Daily 2025-10-31 16:15:39 +00:00 committed by PyTorch MergeBot
parent 1e3600b528
commit c3b71d5499
7 changed files with 36 additions and 17 deletions

View File

@@ -47,9 +47,11 @@ from torch.testing._internal.common_utils import (
gradgradcheck,
instantiate_parametrized_tests,
MACOS_VERSION,
+MI300_ARCH,
parametrize as parametrize_test,
run_tests,
set_default_dtype,
+skipIfRocmArch,
subtest,
TEST_SCIPY,
TEST_WITH_ROCM,
@@ -3393,8 +3395,9 @@ class TestConvolutionNNDeviceType(NNTestCase):
F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device))
F.conv2d(x, torch.randn(1, 16, 1, 1, device=device))
+@skipIfRocmArch(MI300_ARCH)
@onlyCUDA
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
def test_Conv2d_size_1_kernel(self, device):
x_cpu = torch.randn(2, 3, 5, 5)
conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1)
@@ -3425,8 +3428,9 @@ class TestConvolutionNNDeviceType(NNTestCase):
exact_device=False,
)
+@skipIfRocmArch(MI300_ARCH)
@onlyCUDA
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
def test_ConvTranspose2d_size_1_kernel(self, device):
x_cpu = torch.randn(2, 3, 5, 5)
conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1)

View File

@@ -6967,7 +6967,8 @@ class TestCompileKernel(TestCase):
with self.assertRaises(RuntimeError):
kernel.set_shared_memory_config(excessive_shared_mem)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@unittest.skipIf(not TEST_CUDA, "No CUDA")
def test_compile_kernel_advanced(self):
# Test matrix multiplication

View File

@@ -755,10 +755,11 @@ class TestLinalg(TestCase):
cholesky_test_helper(3, batchsize, upper)
@precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4})
+@skipIfRocmArch(MI300_ARCH)
@skipCUDAIfNoMagma
@skipCPUIfNoLapack
@dtypes(*floating_and_complex_types())
-@tf32_on_and_off(0.1 if TEST_WITH_ROCM else 0.01)
+@tf32_on_and_off(0.01)
@reduced_f32_on_and_off(0.01)
def test_old_cholesky(self, device, dtype):
from torch.testing._internal.common_utils import random_hermitian_pd_matrix
@@ -7410,9 +7411,10 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
def test_addmm_gelu(self, device, dtype):
self._test_addmm_impl(torch._addmm_activation, "gelu", device, dtype)
+@skipIfRocmArch(MI300_ARCH)
@dtypes(torch.float, torch.double)
@dtypesIfCUDA(*floating_and_complex_types())
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.005)
def test_addmm_sizes(self, device, dtype):
for m in [0, 1, 25]:
@@ -9369,8 +9371,8 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
r1 = fntorch(t0_full, t1, t2)
self.assertEqual(r0, r1)
-# ROCm 6.4 passes with tf32=on, but 6.4.1 needed tolerance reduced slightly
-@tf32_on_and_off(0.002 if torch.version.hip else 0.001)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.001)
@reduced_f32_on_and_off(0.001)
def test_broadcast_batched_matmul(self, device):
n_dim = random.randint(1, 8)
@@ -9707,7 +9709,8 @@ scipy_lobpcg | {eq_err_scipy:10.2e} | {eq_err_general_scipy:10.2e} | {iters2:
self.assertEqual((torch.tensor(1., device=device), torch.tensor(0., device=device)),
fn(torch.slogdet, (0, 0)))
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.07, 0.005)
def test_tensordot(self, device):
a = torch.arange(60., device=device).reshape(3, 4, 5)

View File

@@ -32,7 +32,7 @@ from torch.nn import Buffer, Parameter
from torch.nn.parallel._functions import Broadcast
from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes, floating_types
from torch.testing._internal.common_utils import dtype_name, freeze_rng_state, run_tests, TestCase, \
-skipIfNoLapack, skipIfRocm, \
+skipIfNoLapack, skipIfRocm, MI300_ARCH, skipIfRocmArch, \
TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
download_file, get_function_arglist, load_tests, skipIfMPS, \
IS_PPC, \
@@ -8378,8 +8378,9 @@ class TestNNDeviceType(NNTestCase):
@unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'),
"Scipy v1.0 and/or numpy not found")
+@skipIfRocmArch(MI300_ARCH)
@expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098
-@tf32_on_and_off(0.01 if TEST_WITH_ROCM else 0.001)
+@tf32_on_and_off(0.001)
@reduced_f32_on_and_off(0.001)
def test_affine_2d_rotate90(self, device):
# scipy before 1.0.0 do not support homogeneous coordinate
@@ -8526,8 +8527,9 @@ class TestNNDeviceType(NNTestCase):
@unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'),
"Scipy v1.0 and/or numpy not found")
+@skipIfRocmArch(MI300_ARCH)
@expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.005)
def test_affine_2d_rotateRandom(self, device):
# scipy before 1.0.0 do not support homogeneous coordinate
@@ -8579,7 +8581,8 @@ class TestNNDeviceType(NNTestCase):
@unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'),
"Scipy v1.0 and/or numpy not found")
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.005)
def test_affine_3d_rotateRandom(self, device):
# scipy before 1.0.0 do not support homogeneous coordinate
@@ -9456,8 +9459,9 @@ class TestNNDeviceType(NNTestCase):
unfold(inp)
@onlyCUDA
+@skipIfRocmArch(MI300_ARCH)
@dtypes(torch.float, torch.double)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
def test_rnn_fused(self, device, dtype):
def copy_rnn(rnn1, rnn2):
@@ -11936,10 +11940,11 @@ class TestNNDeviceType(NNTestCase):
with self.assertRaisesRegex(RuntimeError, "log_probs tensor must not be empty"):
F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction='none')
+@skipIfRocmArch(MI300_ARCH)
@expectedFailureMPS # RuntimeError: LSTM with projections is not currently supported with MPS.
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@tf32_on_and_off(0.005)
@skipIfTorchDynamo("TorchDynamo fails here for unknown reasons")
def test_variable_sequence(self, device, dtype):
def pad(var, length):

View File

@@ -2479,7 +2479,8 @@ class TestTorchDeviceType(TestCase):
self.assertEqual(x1.grad, x2.grad, rtol=0, atol=0.001)
self.assertEqual(y1.grad, y2.grad, rtol=0, atol=0.001)
-@tf32_on_and_off(0.05 if TEST_WITH_ROCM else 0.005)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.005)
@reduced_f32_on_and_off(0.08)
def test_cdist_large(self, device):
for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:

View File

@@ -24,6 +24,8 @@ from torch.testing._internal.common_nn import NNTestCase
from torch.testing._internal.common_utils import (
TEST_WITH_ROCM,
skipIfRocm,
+skipIfRocmArch,
+MI300_ARCH,
skipIfTorchDynamo,
TEST_FAIRSEQ,
run_tests,
@@ -427,7 +429,8 @@ class TestTransformers(NNTestCase):
# remove hook
handle.remove()
-@tf32_on_and_off(0.0021 if TEST_WITH_ROCM else 0.001)
+@skipIfRocmArch(MI300_ARCH)
+@tf32_on_and_off(0.001)
@parametrize("use_torchscript", [False])
@parametrize("enable_nested_tensor", [True, False])
@parametrize("use_autocast", [True, False])

View File

@@ -120,7 +120,9 @@ module_tests = [
desc='no_bias',
reference_fn=lambda i, p, _: torch.mm(i, p[0].t()),
with_tf32=True,
-tf32_precision=0.05 if TEST_WITH_ROCM else 0.005,
+tf32_precision=0.005,
+# ROCM: skipping tf32 test on gfx94 archs due to tolerance issue.
+test_cuda=not (TEST_WITH_ROCM and "gfx94" in torch.cuda.get_device_properties(0).gcnArchName),
default_dtype=torch.double,
),
dict(