From 2344eca5ebd378c2cd1e8c0373b9926da23baf8a Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Wed, 14 May 2025 13:15:03 +0000 Subject: [PATCH] Revert "Fix skipIfXpu and skipIfHpu disables tests when used on class (#151315)" This reverts commit ee096b89f63394b2c18826288783eef241f3959c. Reverted https://github.com/pytorch/pytorch/pull/151315 on behalf of https://github.com/jeanschmidt due to Seems to have introduced internal regressions, see [D74668899](https://www.internalfb.com/diff/D74668899). @malfet may you help the author get this PR merged? ([comment](https://github.com/pytorch/pytorch/pull/151315#issuecomment-2880203323)) --- test/distributed/test_functional_api.py | 32 ++++++++----------------- test/inductor/test_autoheuristic.py | 18 ++++---------- test/inductor/test_b2b_gemm.py | 4 ++-- test/inductor/test_layout_optim.py | 5 ++-- 4 files changed, 19 insertions(+), 40 deletions(-) diff --git a/test/distributed/test_functional_api.py b/test/distributed/test_functional_api.py index c8b77bec1cc..5e6510f2c22 100644 --- a/test/distributed/test_functional_api.py +++ b/test/distributed/test_functional_api.py @@ -3,7 +3,6 @@ import sys import unittest from functools import partial, wraps -from unittest.mock import patch import torch import torch.distributed as dist @@ -11,7 +10,6 @@ import torch.distributed._functional_collectives as ft_c import torch.distributed.distributed_c10d as c10d import torch.distributed.tensor as dt from functorch import make_fx -from torch._dynamo.metrics_context import MetricsContext from torch._inductor.utils import run_and_get_code from torch.testing import FileCheck from torch.testing._internal.common_device_type import instantiate_device_type_tests @@ -33,6 +31,7 @@ from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, run_tests, + skipIfHpu, TEST_CUDA, TEST_HPU, TestCase, @@ -91,7 +90,7 @@ def new_subgroups(group_size: int, pg_tag=None): return cur_subgroup, subgroups -@unittest.skipIf(TEST_HPU, "Unsupported on HPU") +@skipIfHpu class TestExpand(MultiThreadedTestCase): @property def world_size(self): @@ -181,7 +180,7 @@ class TestExpand(MultiThreadedTestCase): self.assertEqual(2, group_size) -@unittest.skipIf(TEST_HPU, "Unsupported on HPU") +@skipIfHpu class TestPgTag(MultiThreadedTestCase): @property def world_size(self): @@ -258,7 +257,7 @@ class TestPgTag(MultiThreadedTestCase): @instantiate_parametrized_tests -@unittest.skipIf(TEST_HPU, "Unsupported on HPU") +@skipIfHpu class TestTraceableCollectives(MultiThreadedTestCase): @property def world_size(self): @@ -404,7 +403,7 @@ class TestMetaCollectives(TestCase): self.assertEqual(x.size(), out.size()) -@unittest.skipIf(TEST_HPU, "Unsupported on HPU") +@skipIfHpu class TestGradCollectives(MultiThreadedTestCase): @property def world_size(self): @@ -657,7 +656,7 @@ class TestDistributedBackendCollectivesWithWorldSize4( @instantiate_parametrized_tests -@unittest.skipIf(TEST_HPU, "Unsupported on HPU") +@skipIfHpu class TestFunctionalAutograd(MultiThreadedTestCase): def setUp(self): super().setUp() @@ -667,13 +666,6 @@ class TestFunctionalAutograd(MultiThreadedTestCase): def world_size(self): return 2 - # `compilation_metric` attempts to update the `is_forward` field of `metrics_context`. Since - # `metrics_context` is a singleton, a runtime error will occur if multiple threads try to update it - # because `MetricsContext` does not allow updating existing fields when `overwrite` is False. 
-    # So, we need to patch the `update` function of MetricsContext
-    def _metrics_context_update(self, *args, **kwargs) -> None:
-        pass
-
     @parametrize("compile", [True, False])
     def test_all_to_all_single(self, compile: bool = True) -> None:
         group = dist.group.WORLD.group_name
@@ -699,8 +691,7 @@ class TestFunctionalAutograd(MultiThreadedTestCase):
         self.assertIsNotNone(out.grad_fn)
         self.assertTrue(out.requires_grad)
         loss = out.sum()
-        with patch.object(MetricsContext, "update", self._metrics_context_update):
-            loss.backward()
+        loss.backward()
         self.assertEqual(t.grad, torch.full_like(t, 2.0))
 
     def test_all_to_all_single_inductor(self) -> None:
@@ -720,8 +711,7 @@ class TestFunctionalAutograd(MultiThreadedTestCase):
 
         def run_with_backward():
             out = compiled(t, self.world_size)
-            with patch.object(MetricsContext, "update", self._metrics_context_update):
-                out.backward()
+            out.backward()
 
         _, codes = run_and_get_code(run_with_backward)
         for code in codes:
@@ -761,8 +751,7 @@ class TestFunctionalAutograd(MultiThreadedTestCase):
             gathered_tensor = compiled(local_tensor, dim)
             self.assertEqual(gathered_tensor, torch.ones(output_size))
 
-            with patch.object(MetricsContext, "update", self._metrics_context_update):
-                gathered_tensor.sum().backward()
+            gathered_tensor.sum().backward()
             self.assertEqual(
                 local_tensor.grad,
                 torch.full((3, 3, 3), fill_value=float(self.world_size)),
@@ -797,8 +786,7 @@ class TestFunctionalAutograd(MultiThreadedTestCase):
             rs_tensor = compiled(input_tensor, dim)
             res_num = 1 * group_size
             self.assertEqual(rs_tensor, torch.ones(input_size) * res_num)
-            with patch.object(MetricsContext, "update", self._metrics_context_update):
-                rs_tensor.sum().backward()
+            rs_tensor.sum().backward()
             self.assertEqual(input_tensor.grad, torch.full(output_size, fill_value=1.0))
 
 
diff --git a/test/inductor/test_autoheuristic.py b/test/inductor/test_autoheuristic.py
index 4faed351215..27060808679 100644
--- a/test/inductor/test_autoheuristic.py
+++ b/test/inductor/test_autoheuristic.py
@@ -4,22 +4,17 @@ import unittest
 
 import torch
 import torch._inductor.config as inductor_config
+from torch._dynamo.device_interface import get_interface_for_device
 from torch._inductor.autoheuristic.autoheuristic import AutoHeuristic, LocalFeedback
 from torch._inductor.autoheuristic.autoheuristic_utils import AHContext
 from torch._inductor.runtime.runtime_utils import cache_dir
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import get_gpu_shared_memory
-from torch.testing._internal.common_utils import TEST_XPU
-from torch.testing._internal.inductor_utils import (
-    GPU_TYPE,
-    HAS_CUDA,
-    HAS_GPU,
-    IS_A100,
-    IS_H100,
-)
+from torch.testing._internal.common_utils import skipIfXpu
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_A100, IS_H100
 
 
-@unittest.skipIf(TEST_XPU, "AutoHeuristic doesn't currently work on the XPU stack")
+@skipIfXpu(msg="AutoHeuristic doesn't currently work on the XPU stack")
 class AutoHeuristicTest(TestCase):
     def count_lines_in_file(self, file_path):
         with open(file_path) as file:
@@ -107,9 +102,7 @@ class AutoHeuristicTest(TestCase):
         self.assertEqual(num_lines, 5)
 
         shared_memory = get_gpu_shared_memory()
-
-        self.assertTrue(HAS_CUDA)
-        (fst, snd) = torch.cuda.get_device_capability()
+        (fst, snd) = get_interface_for_device(GPU_TYPE).get_device_capability()
 
         with open(path) as file:
             lines = file.readlines()
@@ -158,7 +151,6 @@ class AutoHeuristicTest(TestCase):
         fx_graph_cache=False,
         fx_graph_remote_cache=False,
     )
-    @unittest.skipIf(not IS_A100, "heuristic only run on A100")
     def test_global_feedback(self):
         self.run_mixed_mm()
         path = self.get_path_to_autoheuristic_log("mixed_mm")
diff --git a/test/inductor/test_b2b_gemm.py b/test/inductor/test_b2b_gemm.py
index 598339b9bdd..60bbfd6c492 100644
--- a/test/inductor/test_b2b_gemm.py
+++ b/test/inductor/test_b2b_gemm.py
@@ -6,11 +6,11 @@ import torch
 from torch._inductor.runtime.benchmarking import benchmarker
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import run_and_get_code
-from torch.testing._internal.common_utils import TEST_XPU
+from torch.testing._internal.common_utils import skipIfXpu
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
 
 
-@unittest.skipIf(TEST_XPU, "Segmentation fault on CI machine")
+@skipIfXpu(msg="Segmentation fault on CI machine")
 class B2BGEMMTest(TestCase):
     device = GPU_TYPE
 
diff --git a/test/inductor/test_layout_optim.py b/test/inductor/test_layout_optim.py
index 2aeaa9469cd..52203caddab 100644
--- a/test/inductor/test_layout_optim.py
+++ b/test/inductor/test_layout_optim.py
@@ -2,7 +2,6 @@
 import copy
 import os
 import random
-import unittest
 
 import torch
 from torch import nn
@@ -10,7 +9,7 @@ from torch._dynamo.utils import same
 from torch._inductor import config
 from torch._inductor.test_case import run_tests, TestCase
 from torch.testing._internal.common_cuda import tf32_off
-from torch.testing._internal.common_utils import TEST_XPU
+from torch.testing._internal.common_utils import skipIfXpu
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
 
 
-@unittest.skipIf(TEST_XPU, "ccl doesn't currently work on the XPU stack")
+@skipIfXpu(msg="ccl doesn't currently work on the XPU stack")
 class TestLayoutOptim(TestCase):
     @classmethod
     def setUpClass(cls):