Stop skipping entire foreach tests, just skip the profiler portion (#156871)

Instead of skipping the whole test while the CUPTI team figures out what is wrong, let's temporarily skip only the profiler check portion. It is high priority to add it back to ensure foreach ops are actually performant.
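
In other words, the version gate moves from the test-level @skipCUDAVersionIn([(12, 8)]) decorator into ForeachFuncWrapper.__call__, so the op still runs and only the profiler-based fastpath assertion is skipped. A minimal sketch of the pattern (the helper name and call signature are illustrative, not the exact test harness):

    import torch
    from torch.testing._internal.common_cuda import _get_torch_cuda_version

    def run_and_maybe_check_fastpath(func, inputs, expect_fastpath):
        # Illustrative helper mirroring ForeachFuncWrapper.__call__ in the diff below.
        # Profiler results are flaky on CUDA 12.6/12.8, so only the check is gated;
        # the foreach op itself always executes.
        skip_profiler_check = _get_torch_cuda_version() in [(12, 6), (12, 8)]
        if torch.cuda.is_available() and not skip_profiler_check:
            with torch.profiler.profile() as p:
                out = func(*inputs)
            keys = tuple(e.key for e in p.key_averages())
            # The fused fastpath shows up as multi_tensor_apply_kernel in the profile.
            mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
            assert mta_called == expect_fastpath, (mta_called, expect_fastpath, keys)
        else:
            out = func(*inputs)
        return out

The real wrapper additionally checks torch.autograd.kineto_available(), ProfilerActivity.CUDA support, and the zero_size case, as the diff below shows.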

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156871
Approved by: https://github.com/albanD
ghstack dependencies: #156876
Jane Xu, 2025-06-27 09:26:46 -07:00, committed by PyTorch MergeBot
parent 20e40492b0
commit 5a0926a26e

@@ -12,7 +12,7 @@ from numbers import Number
 import torch
 from torch.testing import make_tensor
 from torch.testing._comparison import default_tolerances
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -20,7 +20,6 @@ from torch.testing._internal.common_device_type import (
     onlyCUDA,
     OpDTypes,
     ops,
-    skipCUDAVersionIn,
 )
 from torch.testing._internal.common_dtype import (
     all_types_and_complex_and,
@@ -80,8 +79,13 @@ class ForeachFuncWrapper:
     def __call__(self, inputs, is_cuda, expect_fastpath, **kwargs):
         actual = None
         zero_size = kwargs.pop("zero_size", False)
+        # Skip profiler check for CUDA 12.6, 12.8 as the upgrade makes profiler results flaky
+        # https://github.com/pytorch/pytorch/issues/148681. TODO: ADD IT BACK!!!
+        skip_profiler_check = _get_torch_cuda_version() in [(12, 6), (12, 8)]
         if (
             is_cuda
+            and not skip_profiler_check
             and torch.autograd.kineto_available()
             and torch.profiler.ProfilerActivity.CUDA
             in torch.profiler.supported_activities()
@@ -92,6 +96,7 @@ class ForeachFuncWrapper:
            torch.cuda.synchronize()
            keys = tuple([e.key for e in p.key_averages()])
            mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
            assert mta_called == (expect_fastpath and (not zero_size)), (
                f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
            )
@@ -191,9 +196,6 @@ class TestForeach(TestCase):
                 zero_size=True,
             )

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @skipIfRocmVersionLessThan((6, 0))
     @ops(
         foreach_unary_op_db
@@ -306,9 +308,6 @@ class TestForeach(TestCase):
         else:
             self.assertEqual(expected, actual)

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(filter(lambda op: op.supports_scalar_self_arg, foreach_binary_op_db))
     @parametrize("is_fastpath", (True, False))
     def test_binary_op_with_scalar_self_support(self, device, dtype, op, is_fastpath):
@@ -366,9 +365,6 @@ class TestForeach(TestCase):

     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     def test_pointwise_op_with_tensor_of_scalarlist_overload(
         self, device, dtype, op, is_fastpath
     ):
@@ -706,9 +702,6 @@ class TestForeach(TestCase):
         ):
             foreach_op_([tensor1], [tensor2])

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found")
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
@@ -824,9 +817,6 @@ class TestForeach(TestCase):
             scalar_self_arg=False,
         )

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
         dtypes=floating_types_and(torch.half, torch.bfloat16),
@@ -1350,9 +1340,6 @@ class TestForeach(TestCase):
            copy_(t, s, non_blocking)
        self.assertEqual(ref_input, sample.input)

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @onlyCUDA
     @ops(filter(lambda op: op.name == "_foreach_copy", foreach_binary_op_db))
     def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):