Stop skipping entire foreach tests; just skip the profiler portion (#156871)

Instead of skipping the whole test while the CUPTI team figures out what is wrong, temporarily skip only the profiler-check portion. It is high priority to add it back, since that check is what ensures foreach ops are actually performant.
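
For context, the profiler check being gated is the portion of the test wrapper that verifies the foreach fast path was actually taken. A minimal sketch of the idea follows; the helper name and simplified signature are illustrative, not the test's exact code:

    import torch
    from torch.testing._internal.common_cuda import _get_torch_cuda_version

    def run_and_check_fastpath(func, inputs, expect_fastpath):
        # Profiler results are flaky on CUDA 12.6/12.8, so gate only this
        # check; the op itself still runs and its outputs are still verified.
        skip_profiler_check = _get_torch_cuda_version() in [(12, 6), (12, 8)]
        if torch.cuda.is_available() and not skip_profiler_check:
            with torch.profiler.profile() as p:
                actual = func(*inputs)
            keys = [e.key for e in p.key_averages()]
            # The fast path dispatches through multi_tensor_apply_kernel.
            mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
            assert mta_called == expect_fastpath, f"{mta_called=}, {expect_fastpath=}"
        else:
            actual = func(*inputs)
        return actual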

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156871
Approved by: https://github.com/albanD
ghstack dependencies: #156876
Author: Jane Xu, 2025-06-27 09:26:46 -07:00 (committed by PyTorch MergeBot)
parent 20e40492b0
commit 5a0926a26e

@@ -12,7 +12,7 @@ from numbers import Number
 import torch
 from torch.testing import make_tensor
 from torch.testing._comparison import default_tolerances
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -20,7 +20,6 @@ from torch.testing._internal.common_device_type import (
     onlyCUDA,
     OpDTypes,
     ops,
-    skipCUDAVersionIn,
 )
 from torch.testing._internal.common_dtype import (
     all_types_and_complex_and,
@@ -80,8 +79,13 @@ class ForeachFuncWrapper:
     def __call__(self, inputs, is_cuda, expect_fastpath, **kwargs):
         actual = None
         zero_size = kwargs.pop("zero_size", False)
+        # Skip profiler check for CUDA 12.6, 12.8 as the upgrade makes profiler results flaky
+        # https://github.com/pytorch/pytorch/issues/148681. TODO: ADD IT BACK!!!
+        skip_profiler_check = _get_torch_cuda_version() in [(12, 6), (12, 8)]
         if (
             is_cuda
+            and not skip_profiler_check
             and torch.autograd.kineto_available()
             and torch.profiler.ProfilerActivity.CUDA
             in torch.profiler.supported_activities()
@@ -92,6 +96,7 @@ class ForeachFuncWrapper:
             torch.cuda.synchronize()
             keys = tuple([e.key for e in p.key_averages()])
             mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
             assert mta_called == (expect_fastpath and (not zero_size)), (
                 f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
             )
@@ -191,9 +196,6 @@ class TestForeach(TestCase):
             zero_size=True,
         )
 
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @skipIfRocmVersionLessThan((6, 0))
     @ops(
         foreach_unary_op_db
@@ -306,9 +308,6 @@ class TestForeach(TestCase):
         else:
             self.assertEqual(expected, actual)
 
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(filter(lambda op: op.supports_scalar_self_arg, foreach_binary_op_db))
     @parametrize("is_fastpath", (True, False))
     def test_binary_op_with_scalar_self_support(self, device, dtype, op, is_fastpath):
@@ -366,9 +365,6 @@ class TestForeach(TestCase):
     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     def test_pointwise_op_with_tensor_of_scalarlist_overload(
         self, device, dtype, op, is_fastpath
     ):
@@ -706,9 +702,6 @@ class TestForeach(TestCase):
         ):
             foreach_op_([tensor1], [tensor2])
 
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found")
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
@@ -824,9 +817,6 @@ class TestForeach(TestCase):
             scalar_self_arg=False,
         )
 
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
         dtypes=floating_types_and(torch.half, torch.bfloat16),
@@ -1350,9 +1340,6 @@ class TestForeach(TestCase):
             copy_(t, s, non_blocking)
         self.assertEqual(ref_input, sample.input)
 
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @onlyCUDA
     @ops(filter(lambda op: op.name == "_foreach_copy", foreach_binary_op_db))
     def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):