From 53d68b95dee766465d2dd9023f6d353c72b979ef Mon Sep 17 00:00:00 2001 From: saienduri Date: Wed, 30 Jul 2025 19:47:55 +0000 Subject: [PATCH] [ROCm CI] Migrate to MI325 Capacity. (#159059) This PR moves PyTorch CI capacity from mi300 to a new, larger mi325 cluster. Both of these GPUs are the same architecture gfx942 and our testing plans don't change within an architecture, so we pool them under the same label `linux.rocm.gpu.gfx942.<#gpus>` with this PR as well to reduce overhead and confusion. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159059 Approved by: https://github.com/jithunnair-amd, https://github.com/atalman Co-authored-by: deedongala --- .github/actionlint.yaml | 6 ++-- .github/workflows/_rocm-test.yml | 4 +-- .../inductor-perf-test-nightly-rocm.yml | 34 +++++++++---------- .github/workflows/inductor-rocm-mi300.yml | 4 +-- .github/workflows/periodic-rocm-mi300.yml | 6 ++-- .github/workflows/rocm-mi300.yml | 12 +++---- test/inductor/test_aot_inductor.py | 8 +++++ 7 files changed, 41 insertions(+), 33 deletions(-) diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 4f69b2ae9fb..647671e8c83 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -53,9 +53,9 @@ self-hosted-runner: - linux.rocm.gpu.mi250 - linux.rocm.gpu.2 - linux.rocm.gpu.4 - # MI300 runners - - linux.rocm.gpu.mi300.2 - - linux.rocm.gpu.mi300.4 + # gfx942 runners + - linux.rocm.gpu.gfx942.2 + - linux.rocm.gpu.gfx942.4 - rocm-docker # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors) - macos-m1-stable diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 90bf41fa219..2d660d98905 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -269,8 +269,8 @@ jobs: # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" - - name: Change permissions (only needed for MI300 and MI355 kubernetes runners for now) - if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'mi300') || contains(matrix.runner, 'mi355')) }} + - name: Change permissions (only needed for kubernetes runners for now) + if: ${{ always() && steps.test.conclusion && (contains(matrix.runner, 'gfx942') || contains(matrix.runner, 'mi355')) }} run: | docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml index 389a1c0fc07..377f6d04bc8 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -88,23 +88,23 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ - { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index e5c7ad76f2b..f4c81ce7d7b 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -47,8 +47,8 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, + { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" }, ]} secrets: inherit diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 7eafa9d598e..4d8890e69fc 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -59,9 +59,9 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ - { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, - { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, - { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.gfx942.4", owners: ["module:rocm", "oncall:distributed"] }, ]} secrets: inherit diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index 489d2941a7f..c51d89e5c95 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -48,12 +48,12 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" }, ]} secrets: inherit diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 05b3926b29e..b7157909dbd 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -57,9 +57,11 @@ from torch.testing._internal.common_utils import ( IS_MACOS, IS_WINDOWS, MACOS_VERSION, + MI300_ARCH, parametrize, skipIfMPS, skipIfRocm, + skipIfRocmArch, skipIfWindows, skipIfXpu, TEST_MPS, @@ -149,8 +151,11 @@ except (unittest.SkipTest, ImportError): class AOTInductorTestsTemplate: + # Temporarily skipping test as pytorch/cpuinfo not able to retrieve cache size for + # AMD EPYC 9575F 64-Core Processor CPU in gfx942 VM Runners @common_utils.parametrize("embed_kernel_binary", [False, True]) @common_utils.parametrize("max_autotune", [False, True]) + @skipIfRocmArch(MI300_ARCH) def test_simple(self, embed_kernel_binary, max_autotune): if self.device == "cpu" and IS_MACOS and max_autotune: raise unittest.SkipTest("max_autotune not supported on macos") @@ -4843,7 +4848,10 @@ class AOTInductorTestsTemplate: ) self.assertTrue(same(model(*example_input), actual)) + # Temporarily skipping test as pytorch/cpuinfo not able to retrieve cache size for + # AMD EPYC 9575F 64-Core Processor CPU in gfx942 VM Runners @common_utils.parametrize("max_autotune", [True, False]) + @skipIfRocmArch(MI300_ARCH) def test_misc_1(self, max_autotune): if self.device == "cpu" and IS_MACOS and max_autotune: raise unittest.SkipTest("max_autotune not supported on macos")