Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
Stop skipping entire foreach tests, just skip the profiler portion (#156871)

Instead of skipping the whole test while the CUPTI team figures out what is wrong, temporarily skip only the profiler-check portion. Adding the check back is high priority, since it is what ensures foreach ops are actually performant.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156871
Approved by: https://github.com/albanD
ghstack dependencies: #156876
This commit is contained in:
parent 20e40492b0
commit 5a0926a26e
@@ -12,7 +12,7 @@ from numbers import Number
 import torch
 from torch.testing import make_tensor
 from torch.testing._comparison import default_tolerances
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -20,7 +20,6 @@ from torch.testing._internal.common_device_type import (
     onlyCUDA,
     OpDTypes,
     ops,
-    skipCUDAVersionIn,
 )
 from torch.testing._internal.common_dtype import (
     all_types_and_complex_and,
@@ -80,8 +79,13 @@ class ForeachFuncWrapper:
     def __call__(self, inputs, is_cuda, expect_fastpath, **kwargs):
         actual = None
         zero_size = kwargs.pop("zero_size", False)
+
+        # Skip profiler check for CUDA 12.6, 12.8 as the upgrade makes profiler results flaky
+        # https://github.com/pytorch/pytorch/issues/148681. TODO: ADD IT BACK!!!
+        skip_profiler_check = _get_torch_cuda_version() in [(12, 6), (12, 8)]
         if (
             is_cuda
+            and not skip_profiler_check
             and torch.autograd.kineto_available()
             and torch.profiler.ProfilerActivity.CUDA
             in torch.profiler.supported_activities()
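For context, `_get_torch_cuda_version` reports the CUDA toolkit version PyTorch was built against as a tuple of ints, which is what makes the membership test in the hunk above work. A minimal sketch of an equivalent check, assuming `torch.version.cuda` is a dotted version string like "12.8" (and `None` on CPU-only or ROCm builds); `cuda_version_tuple` is an illustrative stand-in, not the actual helper:

    import torch

    def cuda_version_tuple() -> tuple:
        # Illustrative stand-in for _get_torch_cuda_version. torch.version.cuda
        # is None on non-CUDA builds; treat that as (0, 0) so membership tests
        # against real versions are simply False.
        if torch.version.cuda is None:
            return (0, 0)
        return tuple(int(part) for part in torch.version.cuda.split("."))

    # Mirrors the gate in the diff: disable only the profiler assertion on
    # CUDA 12.6 and 12.8, while the rest of the test still runs.
    skip_profiler_check = cuda_version_tuple() in [(12, 6), (12, 8)]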
@@ -92,6 +96,7 @@ class ForeachFuncWrapper:
             torch.cuda.synchronize()
             keys = tuple([e.key for e in p.key_averages()])
             mta_called = any("multi_tensor_apply_kernel" in k for k in keys)
+
             assert mta_called == (expect_fastpath and (not zero_size)), (
                 f"{mta_called=}, {expect_fastpath=}, {zero_size=}, {self.func.__name__=}, {keys=}"
             )
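The assertion above is the performance check the commit message refers to: the foreach fast path launches a fused multi_tensor_apply_kernel, so its presence or absence in a profiler trace reveals whether the fused path actually ran. A standalone sketch of the same technique, assuming a CUDA build with Kineto available:

    import torch

    # A list of same-shape CUDA tensors; the foreach fast path fuses the
    # per-tensor adds into a single multi_tensor_apply_kernel launch.
    tensors = [torch.ones(100, device="cuda") for _ in range(20)]

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CUDA]
    ) as p:
        torch._foreach_add_(tensors, 1.0)
    torch.cuda.synchronize()

    # Inspect the aggregated trace for the fused kernel, exactly as the
    # wrapper in the diff does.
    keys = [e.key for e in p.key_averages()]
    fastpath_hit = any("multi_tensor_apply_kernel" in k for k in keys)
    print(f"{fastpath_hit=}")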
@@ -191,9 +196,6 @@ class TestForeach(TestCase):
             zero_size=True,
         )

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @skipIfRocmVersionLessThan((6, 0))
     @ops(
         foreach_unary_op_db
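This hunk and the ones that follow all make the same change: dropping the `skipCUDAVersionIn` decorator that used to disable each test wholesale, now that the version gate lives inside the wrapper. As a rough sketch of how such a version-based skip decorator works (a simplified stand-in, not the actual `common_device_type` implementation, which also integrates with the device-type test framework):

    import functools
    import unittest
    import torch

    def skip_cuda_version_in(versions):
        """Skip a test when the active CUDA toolkit version is in `versions`."""
        def decorator(fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                if torch.version.cuda is not None:
                    current = tuple(
                        int(p) for p in torch.version.cuda.split(".")
                    )[:2]
                    if current in versions:
                        raise unittest.SkipTest(f"CUDA {current} in skip list")
                return fn(*args, **kwargs)
            return wrapper
        return decorator

    @skip_cuda_version_in([(12, 8)])
    def test_example():
        ...  # skipped entirely on CUDA 12.8, which is what this PR avoids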
@@ -306,9 +308,6 @@ class TestForeach(TestCase):
         else:
             self.assertEqual(expected, actual)

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(filter(lambda op: op.supports_scalar_self_arg, foreach_binary_op_db))
     @parametrize("is_fastpath", (True, False))
     def test_binary_op_with_scalar_self_support(self, device, dtype, op, is_fastpath):
@@ -366,9 +365,6 @@ class TestForeach(TestCase):

     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     def test_pointwise_op_with_tensor_of_scalarlist_overload(
         self, device, dtype, op, is_fastpath
     ):
@@ -706,9 +702,6 @@ class TestForeach(TestCase):
         ):
             foreach_op_([tensor1], [tensor2])

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found")
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
@@ -824,9 +817,6 @@ class TestForeach(TestCase):
             scalar_self_arg=False,
         )

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @ops(
         filter(lambda op: op.supports_out, foreach_binary_op_db),
         dtypes=floating_types_and(torch.half, torch.bfloat16),
@@ -1350,9 +1340,6 @@ class TestForeach(TestCase):
             copy_(t, s, non_blocking)
         self.assertEqual(ref_input, sample.input)

-    # Skip CUDA version 12.8 as the upgrade makes profiler results flaky
-    # https://github.com/pytorch/pytorch/issues/148681
-    @skipCUDAVersionIn([(12, 8)])
     @onlyCUDA
     @ops(filter(lambda op: op.name == "_foreach_copy", foreach_binary_op_db))
     def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):