From 8700d68fef855850e2e0aa65056a77b8f80adbdb Mon Sep 17 00:00:00 2001 From: amdfaa <107946068+amdfaa@users.noreply.github.com> Date: Mon, 20 Oct 2025 16:06:37 +0000 Subject: [PATCH] [ROCm][CI] Update rocm.yml workflow to use 1 GPU ARC runners (#165481) * Moving rocm.yml from using persistent non-ARC runners from the combined MI2xx (MI210 + MI250) cluster to the ARC runners from the MI250 cluster. This halves the number of nodes, but provides access to approximately 4 times the runners, since every 8-GPU MI250 node now provides 8 1-GPU runners. This should help with concurrent capacity and queueing on the MI2xx jobs. Tested here successfully: https://github.com/pytorch/pytorch/actions/runs/18620814622/job/53092469720 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165481 Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony, https://github.com/albanD Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> --- .github/actionlint.yaml | 13 +++++++++---- .github/workflows/rocm.yml | 12 ++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d021371ca88..3a3d2a2e699 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -1,4 +1,4 @@ -self-hosted-runner: +self-hosted-runner: labels: # GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old - ubuntu-24.04 @@ -54,12 +54,17 @@ self-hosted-runner: - windows-11-arm64 - windows-11-arm64-preview # Organization-wide AMD-hosted runners - # MI2xx runners + # MI2xx non-ARC runners - linux.rocm.gpu - - linux.rocm.gpu.mi250 - linux.rocm.gpu.2 - linux.rocm.gpu.4 - # gfx942 runners + - linux.rocm.gpu.mi250 + - linux.rocm.gpu.gfx1100 + # MI2xx ARC runners + - linux.rocm.gpu.mi250.1 + - linux.rocm.gpu.mi250.2 + - linux.rocm.gpu.mi250.4 + # gfx942 ARC runners - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 diff 
--git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index e6bc43f34f8..36cc5fd0e8e 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -36,12 +36,12 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, ]} secrets: inherit