diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 6e6523a636a..944da4c39b7 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -497,7 +497,7 @@ docker build \
   --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
   --build-arg "KATEX=${KATEX:-}" \
   --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
-  --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a}" \
+  --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
   --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
   --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
   --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
diff --git a/.github/actions/diskspace-cleanup/action.yml b/.github/actions/diskspace-cleanup/action.yml
index b6ef55f5792..7291adb59a1 100644
--- a/.github/actions/diskspace-cleanup/action.yml
+++ b/.github/actions/diskspace-cleanup/action.yml
@@ -17,6 +17,10 @@ runs:
         set -ex
         diskspace_cutoff=${{ inputs.diskspace-cutoff }}
         docker_root_dir=$(docker info -f '{{.DockerRootDir}}')
+        if [ ! -d "$docker_root_dir" ]; then
+            echo "Docker root directory ($docker_root_dir) does not exist. Skipping disk space check."
+            exit 0
+        fi
        diskspace=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
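The new guard above makes the disk-space step a no-op when Docker's root directory is missing (for example on a freshly provisioned runner), instead of letting `df` fail and flag the runner as faulty. For illustration only, a minimal Python sketch of the same decision logic; the action itself stays in bash, and the function name and cutoff default here are hypothetical:

```python
import os
import shutil


def diskspace_ok(docker_root_dir: str, diskspace_cutoff: int = 90) -> bool:
    """Mirror of the action's check: skip when the directory is absent,
    otherwise compare the used-space percentage against the cutoff."""
    if not os.path.isdir(docker_root_dir):
        print(f"Docker root directory ({docker_root_dir}) does not exist. Skipping disk space check.")
        return True
    usage = shutil.disk_usage(docker_root_dir)
    return usage.used * 100 // usage.total < diskspace_cutoff
```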
- echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon --group-add bin" >> "${GITHUB_ENV}" + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device /dev/dri --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}" diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 39a8ef12364..ee120d7d40b 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -36,12 +36,12 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, + { config: "default", shard: 2, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, + { config: "default", shard: 3, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, + { config: "default", shard: 4, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, + { config: "default", shard: 5, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, + { config: "default", shard: 6, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, ]} secrets: inherit diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 760f9e4354e..4991793ee36 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -175,9 +175,9 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" }, - { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" }, + { config: "default", shard: 1, num_shards: 2, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, + { config: "default", shard: 2, num_shards: 2, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.4' || 'linux.rocm.gpu.4' }}" }, ]} secrets: inherit diff --git a/test/distributed/_tools/test_sac_ilp.py b/test/distributed/_tools/test_sac_ilp.py index 2d8c96a0a1a..f3ee159f974 100644 --- a/test/distributed/_tools/test_sac_ilp.py +++ b/test/distributed/_tools/test_sac_ilp.py @@ -19,7 
diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml
index 39a8ef12364..ee120d7d40b 100644
--- a/.github/workflows/rocm.yml
+++ b/.github/workflows/rocm.yml
@@ -36,12 +36,12 @@ jobs:
       sync-tag: rocm-build
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
+          { config: "default", shard: 1, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
+          { config: "default", shard: 2, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
+          { config: "default", shard: 3, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
+          { config: "default", shard: 4, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
+          { config: "default", shard: 5, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
+          { config: "default", shard: 6, num_shards: 6, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
         ]}
     secrets: inherit
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 760f9e4354e..4991793ee36 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -175,9 +175,9 @@ jobs:
       sync-tag: rocm-build
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
-          { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" },
+          { config: "default", shard: 1, num_shards: 2, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
+          { config: "default", shard: 2, num_shards: 2, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.2' || 'linux.rocm.gpu.2' }}" },
+          { config: "distributed", shard: 1, num_shards: 1, runner: "${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' && 'linux.rocm.gpu.mi300.4' || 'linux.rocm.gpu.4' }}" },
         ]}
     secrets: inherit
diff --git a/test/distributed/_tools/test_sac_ilp.py b/test/distributed/_tools/test_sac_ilp.py
index 2d8c96a0a1a..f3ee159f974 100644
--- a/test/distributed/_tools/test_sac_ilp.py
+++ b/test/distributed/_tools/test_sac_ilp.py
@@ -19,7 +19,13 @@ from torch.distributed._tools.sac_ilp import (
     sac_milp,
 )
 from torch.testing._internal.common_cuda import TEST_CUDA
-from torch.testing._internal.common_utils import run_tests, skipIfTorchDynamo, TestCase
+from torch.testing._internal.common_utils import (
+    MI300_ARCH,
+    run_tests,
+    skipIfRocmArch,
+    skipIfTorchDynamo,
+    TestCase,
+)
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     ModelArgs,
     Transformer,
@@ -131,6 +137,7 @@ class TestSACILP(TestCase):
     @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
+    @skipIfRocmArch(MI300_ARCH)
     def test_sac_ilp_case1(self):
         """
         This is a case where the memory budget is either binding or too tight,
diff --git a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py
index c88677cf069..dac6ab61979 100644
--- a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py
+++ b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py
@@ -29,10 +29,9 @@ from torch.distributed.tensor.parallel import (
 )
 from torch.testing._internal.common_utils import (  # type: ignore[attr-defined]
     instantiate_parametrized_tests,
-    MI300_ARCH,
     parametrize,
     run_tests,
-    runOnRocmArch,
+    skipIfRocm,
     TestCase,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule
@@ -241,7 +240,7 @@ class MicroPipelineTPTest(TestCase):
         self.assertNotIn("all_gather_into_tensor", code)
         self.assertEqual("return_A=True" in code, return_A)
 
-    @runOnRocmArch(MI300_ARCH)
+    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     @parametrize("A_dims", [2, 3])
     @parametrize("gather_dim", [0, 1, 2])
@@ -344,7 +343,7 @@ class MicroPipelineTPTest(TestCase):
         self.assertIn("fused_matmul_reduce_scatter", code)
         self.assertNotIn("reduce_scatter_tensor", code)
 
-    @runOnRocmArch(MI300_ARCH)
+    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     @parametrize("A_dims", [2, 3])
     @parametrize("scatter_dim", [0, 1, 2])
diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index 1cd8629e04c..aec59c27669 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -49,9 +49,11 @@ from torch.testing._internal.common_distributed import (
     verify_ddp_error_logged,
 )
 from torch.testing._internal.common_utils import (
+    MI300_ARCH,
     retry_on_connect_failures,
     run_tests,
     skip_but_pass_in_sandcastle,
+    skipIfRocmArch,
     TestCase,
 )
 
@@ -1097,6 +1099,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
         self._test_gather_stress(inputs, lambda t: t.clone())
 
     @skip_if_lt_x_gpu(2)
+    @skipIfRocmArch(MI300_ARCH)
     @requires_gloo()
     def test_gather_stress_cuda(self):
         inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)]
diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py
index 9c0060295b0..250434a635e 100644
--- a/test/inductor/test_cpu_repro.py
+++ b/test/inductor/test_cpu_repro.py
@@ -36,6 +36,7 @@ from torch.testing._internal.common_utils import (
     skipIfRocm,
     slowTest,
     TEST_MKL,
+    TEST_WITH_ROCM,
     xfailIfS390X,
 )
 from torch.utils._python_dispatch import TorchDispatchMode
@@ -583,6 +584,12 @@ class CPUReproTests(TestCase):
         batch_size,
         seq_len,
     ):
+        if (
+            TEST_WITH_ROCM
+            and not unbatched
+            and "gfx94" in torch.cuda.get_device_properties(0).gcnArchName.split(":")[0]
+        ):
+            self.skipTest("Flaky on MI300 with unbatched=False")
         self._test_lstm_packed(
             unbatched,
             input_size,
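The skip added to the LSTM test above relies on the same architecture probe that appears in several other files in this diff: on ROCm builds, `torch.cuda.get_device_properties(0).gcnArchName` reports a string such as `gfx942:sramecc+:xnack-`, and the portion before the first colon identifies the GPU family. A minimal sketch of that probe, pulled out for clarity (the helper name is illustrative, not part of the change):

```python
import torch


def is_mi300_class_gpu() -> bool:
    """True when running a ROCm build on a gfx94x (MI300-class) device.
    gcnArchName looks like 'gfx942:sramecc+:xnack-', so only the part
    before the first ':' is compared."""
    if torch.version.hip is None or not torch.cuda.is_available():
        return False
    arch = torch.cuda.get_device_properties(0).gcnArchName.split(":")[0]
    return "gfx94" in arch
```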
diff --git a/test/inductor/test_fp8.py b/test/inductor/test_fp8.py
index 8d11ec24406..d52a3b0a33e 100644
--- a/test/inductor/test_fp8.py
+++ b/test/inductor/test_fp8.py
@@ -2,6 +2,7 @@
 
 import functools
 import unittest
+from typing import List, Tuple, Union
 
 import torch
 from torch import Tensor
@@ -87,6 +88,33 @@ def _quantize_rowwise(x: Tensor, float8_dtype: torch.dtype):
     return x_fp8, inverse_scale
 
 
+def _fix_fp8_dtype_for_rocm(
+    dtype: Union[torch.dtype, List[torch.dtype], Tuple[torch.dtype]], device
+) -> Union[torch.dtype, List[torch.dtype], Tuple[torch.dtype]]:
+    # This function replaces FP8 data types
+    # with the MI300-supported FP8 types when the device is a GPU:
+    #   e4m3fn -> e4m3fnuz
+    #   e5m2 -> e5m2fnuz
+    # Supports a single dtype as well as tuples and lists of dtypes.
+    # Keeps the same test name for CUDA and ROCm.
+    # It also allows enabling the FP8 inductor tests for CPU.
+    if (
+        torch.version.hip
+        and ("cuda" in device)
+        and ("gfx94" in torch.cuda.get_device_properties(0).gcnArchName.split(":")[0])
+    ):
+        # MI300 uses different float8 dtypes
+        if isinstance(dtype, tuple):
+            return tuple(_fix_fp8_dtype_for_rocm(x, device) for x in dtype)
+        if isinstance(dtype, list):
+            return [_fix_fp8_dtype_for_rocm(x, device) for x in dtype]
+        if dtype == torch.float8_e4m3fn:
+            return torch.float8_e4m3fnuz
+        elif dtype == torch.float8_e5m2:
+            return torch.float8_e5m2fnuz
+    return dtype
+
+
 @instantiate_parametrized_tests
 class TestFP8Types(TestCase):
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
@@ -116,9 +144,8 @@ class TestFP8Types(TestCase):
     def test_eager_fallback(self, dtype: torch.dtype):
         weight_shape = (32, 16)
 
-        e4m3_type = (
-            torch.float8_e4m3fn if torch.version.hip is None else torch.float8_e4m3fnuz
-        )
+        e4m3_type = torch.float8_e4m3fn
+        e4m3_type = _fix_fp8_dtype_for_rocm(e4m3_type, device="cuda")
 
         def fp8_matmul_unwrapped(x):
             a_scale = torch.Tensor([1.0]).to(device="cuda")
@@ -156,13 +183,9 @@ class TestFP8Types(TestCase):
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
     @parametrize("dtype", (torch.float16, torch.bfloat16, torch.float))
     @parametrize("shape", ("15,3,13", "4,2048,4096"))
-    @parametrize(
-        "dst_types",
-        [(torch.float8_e4m3fn, torch.float8_e5m2)]
-        if torch.version.hip is None
-        else [(torch.float8_e4m3fnuz, torch.float8_e5m2fnuz)],
-    )
+    @parametrize("dst_types", [(torch.float8_e4m3fn, torch.float8_e5m2)])
     def test_valid_cast(self, dtype: torch.dtype, shape: str, dst_types: tuple):
+        dst_types = _fix_fp8_dtype_for_rocm(dst_types, device="cuda")
         e4m3, e5m2 = dst_types
 
         def fp8_cast(x):
@@ -204,16 +227,13 @@ class TestFP8Types(TestCase):
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
     @parametrize("src_dtype", (torch.float16, torch.bfloat16, torch.float))
-    @parametrize(
-        "dst_dtype",
-        (torch.float8_e4m3fn, torch.float8_e5m2)
-        if torch.version.hip is None
-        else (torch.float8_e4m3fnuz, torch.float8_e5m2fnuz),
-    )
+    @parametrize("dst_dtype", (torch.float8_e4m3fn, torch.float8_e5m2))
     @parametrize("shape", ("16,16,16", "4,2048,4096"))
     def test_to_fp8_saturated(
         self, src_dtype: torch.dtype, dst_dtype: torch.dtype, shape: str
     ):
+        dst_dtype = _fix_fp8_dtype_for_rocm(dst_dtype, device="cuda")
+
         def fp8_saturated(x, dtype):
             return _to_fp8_saturated(x, dtype)
 
@@ -229,14 +249,10 @@ class TestFP8Types(TestCase):
 
     @unittest.skipIf(TEST_WITH_ROCM, "ROCm fails with accuracy issue")
     @unittest.skipIf(not SM90OrLater, "FP8 is only supported on H100+")
-    @parametrize(
-        "float8_dtype",
-        (torch.float8_e4m3fn, torch.float8_e5m2)
-        if torch.version.hip is None
-        else (torch.float8_e4m3fnuz, torch.float8_e5m2fnuz),
-    )
+    @parametrize("float8_dtype", (torch.float8_e4m3fn, torch.float8_e5m2))
     @parametrize("shape", ("1,1,15", "1,10,15", "1,10,512", "1,10,4096", "4,2048,4096"))
     def test_amax_fp8_quant(self, float8_dtype: torch.dtype, shape: str):
+        float8_dtype = _fix_fp8_dtype_for_rocm(float8_dtype, device="cuda")
         shape = [int(dim) for dim in shape.split(",")]
         batch_size, sequence_length, hidden_size = shape
 
@@ -258,14 +274,10 @@ class TestFP8Types(TestCase):
         torch.testing.assert_close(y_compiled.half(), y.half(), rtol=1e-2, atol=1e-2)
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
-    @parametrize(
-        "float8_dtype",
-        (torch.float8_e4m3fn, torch.float8_e5m2)
-        if torch.version.hip is None
-        else (torch.float8_e4m3fnuz, torch.float8_e5m2fnuz),
-    )
+    @parametrize("float8_dtype", (torch.float8_e4m3fn, torch.float8_e5m2))
     @parametrize("shape", ("1,1,15", "1,10,15", "1,10,512", "1,10,4096", "4,2048,4096"))
     def test_amax_along_with_fp8_quant(self, float8_dtype: torch.dtype, shape: str):
+        float8_dtype = _fix_fp8_dtype_for_rocm(float8_dtype, device="cuda")
         shape = [int(dim) for dim in shape.split(",")]
         batch_size, sequence_length, hidden_size = shape
 
@@ -293,17 +305,13 @@ class TestFP8Types(TestCase):
 
     @unittest.skipIf(TEST_WITH_ROCM, "ROCm fails with accuracy issue")
     @unittest.skipIf(not SM90OrLater, "FP8 is only supported on H100+")
-    @parametrize(
-        "float8_dtype",
-        (torch.float8_e4m3fn, torch.float8_e5m2)
-        if torch.version.hip is None
-        else (torch.float8_e4m3fnuz, torch.float8_e5m2fnuz),
-    )
+    @parametrize("float8_dtype", (torch.float8_e4m3fn, torch.float8_e5m2))
     @parametrize("amax_keep_dim", (True, False))
     @parametrize("shape", ("1,1,15", "1,10,15", "1,10,512", "1,10,4096", "4,2048,4096"))
     def test_layernorm_fp8_quant(
         self, float8_dtype: torch.dtype, amax_keep_dim: bool, shape: str
     ):
+        float8_dtype = _fix_fp8_dtype_for_rocm(float8_dtype, device="cuda")
         shape = [int(dim) for dim in shape.split(",")]
         batch_size, sequence_length, hidden_size = shape
 
@@ -339,12 +347,7 @@ class TestFP8Types(TestCase):
     )
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
-    @parametrize(
-        "float8_dtype",
-        (torch.float8_e4m3fn, torch.float8_e5m2)
-        if torch.version.hip is None
-        else (torch.float8_e4m3fnuz, torch.float8_e5m2fnuz),
-    )
+    @parametrize("float8_dtype", (torch.float8_e4m3fn, torch.float8_e5m2))
     @parametrize("shape", ("4,2048,4096",))
     @parametrize("keepdim", (False, True))
     def test_layernorm_fp8_quant_benchmark(
@@ -353,6 +356,7 @@ class TestFP8Types(TestCase):
         self,
         float8_dtype: torch.dtype,
         shape: str,
         keepdim: bool,
     ):
+        float8_dtype = _fix_fp8_dtype_for_rocm(float8_dtype, device="cuda")
         shape = [int(dim) for dim in shape.split(",")]
         batch_size, sequence_length, hidden_size = shape
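With `_fix_fp8_dtype_for_rocm`, the tests above keep a single CUDA-centric `@parametrize` list and remap the dtype at runtime, so the generated test names stay identical on CUDA and ROCm and the MI300-only `fnuz` variants never leak into test IDs. A sketch of the call pattern as used inside a test body, assuming the helper defined in this file:

```python
import torch

# dtype comes from @parametrize and is always spelled with the CUDA names:
float8_dtype = torch.float8_e4m3fn
# On an MI300 (gfx94x) ROCm build this returns torch.float8_e4m3fnuz;
# on CUDA, CPU, or other ROCm architectures it is returned unchanged.
float8_dtype = _fix_fp8_dtype_for_rocm(float8_dtype, device="cuda")
```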
diff --git a/test/inductor/test_loop_ordering.py b/test/inductor/test_loop_ordering.py
index 50cc0ef0303..4bd3b33c94d 100644
--- a/test/inductor/test_loop_ordering.py
+++ b/test/inductor/test_loop_ordering.py
@@ -19,6 +19,7 @@ from torch._inductor.test_operators import realize
 from torch._inductor.utils import sympy_index_symbol
 from torch._inductor.virtualized import ops, V
 from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
+from torch.testing._internal.common_utils import skipIfRocm
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
 from torch.utils._pytree import tree_map
 from torch.utils._sympy.functions import ModularIndexing
@@ -398,6 +399,7 @@ class LoopOrderingTest(TestCase):
         self.do_acc_test(f, x)
         self.assertEqual(1, metrics.generated_kernel_count)
 
+    @skipIfRocm
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, "FP8 requires H100+ and MI300+")
     def test_fp8_cast_and_t(self):
         """
@@ -420,6 +422,7 @@ class LoopOrderingTest(TestCase):
         self.do_acc_test(f, x, scale)
         self.assertEqual(1, metrics.generated_kernel_count)
 
+    @skipIfRocm
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, "FP8 requires H100+ and MI300+")
     def test_fp8_pattern_2(self):
         """
diff --git a/test/inductor/test_mkldnn_pattern_matcher.py b/test/inductor/test_mkldnn_pattern_matcher.py
index a7d7ca2b18f..179276661dc 100644
--- a/test/inductor/test_mkldnn_pattern_matcher.py
+++ b/test/inductor/test_mkldnn_pattern_matcher.py
@@ -23,9 +23,11 @@ from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     IS_FBCODE,
     IS_LINUX,
+    MI300_ARCH,
     parametrize,
     skipIfNoXPU,
     skipIfRocm,
+    skipIfRocmArch,
     TEST_ACL,
     TEST_MKL,
     xfailIfACL,
@@ -1032,6 +1034,7 @@ class TestPatternMatcher(TestPatternMatcherBase):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
+    @skipIfRocmArch(MI300_ARCH)
     def test_qconv2d_int8_mixed_bf16(self):
         r"""
         This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization.
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 3e4d66f2e4a..0929854d8c0 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -60,6 +60,7 @@ from torch.testing._internal.common_utils import (
     IS_SANDCASTLE,
     IS_WINDOWS,
     load_tests,
+    MI300_ARCH,
     NO_MULTIPROCESSING_SPAWN,
     parametrize,
     run_tests,
@@ -68,6 +69,7 @@ from torch.testing._internal.common_utils import (
     skipCUDAMemoryLeakCheckIf,
     skipCUDANonDefaultStreamIf,
     skipIfRocm,
+    skipIfRocmArch,
     slowTest,
     subtest,
     TemporaryFileName,
@@ -4055,10 +4057,12 @@ class TestCudaMallocAsync(TestCase):
         self.assertTrue(num_bytes // 32 <= mem_bytes <= num_bytes * 32)
 
     @unittest.skipIf(TEST_PYNVML, "pynvml/amdsmi is not available")
+    @skipIfRocmArch(MI300_ARCH)
     def test_power_draw(self):
         self.assertTrue(torch.cuda.power_draw() >= 0)
 
     @unittest.skipIf(TEST_PYNVML, "pynvml/amdsmi is not available")
+    @skipIfRocmArch(MI300_ARCH)
     def test_clock_speed(self):
         self.assertTrue(torch.cuda.clock_rate() >= 0)
diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py
index abd1b754dff..719c339c497 100644
--- a/test/test_matmul_cuda.py
+++ b/test/test_matmul_cuda.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: linear algebra"]
 
+import contextlib
 import unittest
 from itertools import product
 from functools import partial
@@ -351,20 +352,20 @@ class TestFP8MatmulCuda(TestCase):
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
     def test_float8_basics(self, device) -> None:
         self._test_tautological_mm(device, e4m3_type, e4m3_type, size=16)
-        # hipblaslt does not yet support mixed e4m3_type input
-        if torch.version.hip is None:
-            self._test_tautological_mm(device, e4m3_type, e5m2_type, size=32)
-            self._test_tautological_mm(device, e5m2_type, e4m3_type, size=48)
         # According to https://docs.nvidia.com/cuda/cublas/#id99 8F_E5M2 MM is unsupported
-        with self.assertRaises(RuntimeError):
+        # supported on ROCm but fails on CUDA
+        ctx = self.assertRaises(RuntimeError) if torch.version.hip is None else contextlib.nullcontext()
+        with ctx:
             self._test_tautological_mm(device, e5m2_type, e5m2_type)
+        self._test_tautological_mm(device, e4m3_type, e5m2_type, size=32)
+        self._test_tautological_mm(device, e5m2_type, e4m3_type, size=48)
+
         self._test_tautological_mm(device, size=64, out_dtype=torch.float16)
         self._test_tautological_mm(device, size=96, out_dtype=torch.float32)
-        # hipblaslt does not yet support bfloat16 output
-        if torch.version.hip is None:
-            self._test_tautological_mm(device, size=80, out_dtype=torch.bfloat16)
-        with self.assertRaises(RuntimeError):
+        self._test_tautological_mm(device, size=80, out_dtype=torch.bfloat16)
+
+        with self.assertRaises(AssertionError if torch.version.hip else RuntimeError):
             self._test_tautological_mm(device, out_dtype=e5m2_type)
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
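The `contextlib.nullcontext()` trick above lets one `with` block either assert that an exception is raised (CUDA, where the e5m2 x e5m2 matmul is unsupported) or run the same statements unchecked (ROCm, where it is supported). The general pattern, reduced to a self-contained example with a hypothetical platform flag:

```python
import contextlib
import unittest


class ConditionalRaiseExample(unittest.TestCase):
    def test_maybe_raises(self):
        expect_error = True  # stand-in for a platform check such as `torch.version.hip is None`
        # assertRaises(...) returns a context manager, so it can be swapped
        # for a no-op nullcontext() when no exception is expected.
        ctx = self.assertRaises(RuntimeError) if expect_error else contextlib.nullcontext()
        with ctx:
            raise RuntimeError("unsupported on this platform")


if __name__ == "__main__":
    unittest.main()
```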
diff --git a/test/test_torch.py b/test/test_torch.py
index ff882391d5b..f9e848ace6e 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -35,9 +35,9 @@ from torch.testing._internal.common_optimizers import (
     optim_db, optims, _get_optim_inputs_including_global_cliquey_kwargs)
 
 from torch.testing._internal.common_utils import (  # type: ignore[attr-defined]
-    TEST_WITH_TORCHINDUCTOR, TEST_WITH_ROCM, run_tests, IS_JETSON,
+    MI300_ARCH, TEST_WITH_TORCHINDUCTOR, TEST_WITH_ROCM, run_tests, IS_JETSON,
     IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN,
-    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, skipIfTorchInductor, load_tests, slowTest, slowTestIf,
+    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, skipIfRocmArch, skipIfTorchInductor, load_tests, slowTest, slowTestIf,
     skipIfCrossRef, TEST_WITH_CROSSREF, skipIfTorchDynamo, skipRocmIfTorchInductor, set_default_dtype,
     skipCUDAMemoryLeakCheckIf, BytesIOContext, skipIfRocm,
     skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName,
@@ -1738,6 +1738,7 @@ else:
             'embedding_bag_backward_cuda_max', torch.device(device).type == 'cuda')
 
+    @skipIfRocmArch(MI300_ARCH)
     @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707")
     @onlyCUDA
     def test_deterministic_cumsum(self, device):
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 7214fc09396..ab737a652eb 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1855,6 +1855,19 @@ def skipIfRocm(func=None, *, msg="test doesn't currently work on the ROCm stack"
         return dec_fn(func)
     return dec_fn
 
+def skipIfRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
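The new `skipIfRocmArch` decorator gates a test on the GPU architecture of the ROCm host rather than on ROCm as a whole: it only fires under `TEST_WITH_ROCM`, inspects device 0, and compares the `gcnArchName` prefix against the supplied tuple (for example `MI300_ARCH`, also defined in common_utils.py). A hypothetical test illustrating the intended usage:

```python
from torch.testing._internal.common_utils import (
    MI300_ARCH,
    run_tests,
    skipIfRocmArch,
    TestCase,
)


class ExampleTest(TestCase):
    @skipIfRocmArch(MI300_ARCH)
    def test_flaky_on_mi300_only(self):
        # Runs on CUDA and on ROCm hosts whose gcnArchName is not in MI300_ARCH;
        # it is skipped only on MI300-class (gfx94x) ROCm runners.
        self.assertTrue(True)


if __name__ == "__main__":
    run_tests()
```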