Stop skipping entire foreach tests, just skip the profiler portion (#156871)

Instead of skipping the whole test while the CUPTI team figures out what is wrong, let's temporarily skip only the profiler check portion. It is high priority to add it back to ensure foreach ops are actually performant.
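
In other words, the version gate moves from the test-level @skipCUDAVersionIn([(12, 8)]) decorator into ForeachFuncWrapper.__call__, so the op still runs and only the profiler-based fastpath assertion is skipped. A minimal sketch of the pattern (the helper name and call signature are illustrative, not the exact test harness):

    import torch
    from torch.testing._internal.common_cuda import _get_torch_cuda_version

    def run_and_maybe_check_fastpath(func, inputs, expect_fastpath):
        # Illustrative helper mirroring ForeachFuncWrapper.__call__ in the diff below.
        # Profiler results are flaky on CUDA 12.6/12.8, so only the check is gated;
        # the foreach op itself always executes.
        skip_profiler_check = _get_torch_cuda_version() in [(12, 6), (12, 8)]
        if torch.cuda.is_available() and not skip_profiler_check:
            with torch.profiler.profile() as p:
                out = func(*inputs)
            keys = tuple(e.key for e in p.key_averages())
            # The fused fastpath shows up as multi_tensor_apply_kernel in the profile.
            mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
            assert mta_called == expect_fastpath, (mta_called, expect_fastpath, keys)
        else:
            out = func(*inputs)
        return out

The real wrapper additionally checks torch.autograd.kineto_available(), ProfilerActivity.CUDA support, and the zero_size case, as the diff below shows.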

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156871
Approved by: https://github.com/albanD
ghstack dependencies: #156876
Jane Xu, 2025-06-27 09:26:46 -07:00, committed by PyTorch MergeBot
parent 20e40492b0
commit 5a0926a26e

@@ -12,7 +12,7 @@ from numbers import Number
 import torch
 from torch.testing import make_tensor
 from torch.testing._comparison import default_tolerances
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -20,7 +20,6 @@ from torch.testing._internal.common_device_type import (
     onlyCUDA,
     OpDTypes,
     ops,
-    skipCUDAVersionIn,
 )
 from torch.testing._internal.common_dtype import (
     all_types_and_complex_and,
@@ -80,8 +79,13 @@ class ForeachFuncWrapper:
     def __call__(self, inputs, is_cuda, expect_fastpath, **kwargs):
         actual = None
         zero_size = kwargs.pop("zero_size", False)
+        # Skip profiler check for CUDA 12.6, 12.8 as the upgrade makes profiler results flaky
+        # https://github.com/pytorch/pytorch/issues/148681. TODO: ADD IT BACK!!!
+        skip_profiler_check = _get_torch_cuda_version() in [(12, 6), (12, 8)]
         if (
             is_cuda
+            and not skip_profiler_check
             and torch.autograd.kineto_available()
             and torch.profiler.ProfilerActivity.CUDA
             in torch.profiler.supported_activities()
@@ -92,6 +96,7 @@ class ForeachFuncWrapper:
            torch.cuda.synchronize()
            keys = tuple([e.key for e in p.key_averages()])
            mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
            assert mta_called == (expect_fastpath and (not zero_size)), (
                f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
            )
@@ -191,9 +196,6 @@ class TestForeach(TestCase):
                 zero_size=True,
             )

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @skipIfRocmVersionLessThan((6, 0))
     @ops(
         foreach_unary_op_db
@@ -306,9 +308,6 @@ class TestForeach(TestCase):
         else:
             self.assertEqual(expected, actual)

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(filter(lambda op: op.supports_scalar_self_arg, foreach_binary_op_db))
     @parametrize("is_fastpath", (True, False))
     def test_binary_op_with_scalar_self_support(self, device, dtype, op, is_fastpath):
@@ -366,9 +365,6 @@ class TestForeach(TestCase):

     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     def test_pointwise_op_with_tensor_of_scalarlist_overload(
         self, device, dtype, op, is_fastpath
     ):
@@ -706,9 +702,6 @@ class TestForeach(TestCase):
         ):
             foreach_op_([tensor1], [tensor2])

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found")
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
@@ -824,9 +817,6 @@ class TestForeach(TestCase):
             scalar_self_arg=False,
         )

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
         dtypes=floating_types_and(torch.half, torch.bfloat16),
@@ -1350,9 +1340,6 @@ class TestForeach(TestCase):
            copy_(t, s, non_blocking)
        self.assertEqual(ref_input, sample.input)

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @onlyCUDA
     @ops(filter(lambda op: op.name == "_foreach_copy", foreach_binary_op_db))
     def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):