[ROCm][CI] Update rocm.yml workflow to use 1 GPU ARC runners (#165481)

* Moving rocm.yml from using persistent non-ARC runners from the combined MI2xx (MI210 + MI250) cluster to the ARC runners from the MI250 cluster. This halves the number of nodes, but provides access to approximately 4 times the runners, since every 8-GPU MI250 node now provides 8 1-GPU runners. This should help with concurrent capacity and queueing on the MI2xx jobs.

Tested here successfully: https://github.com/pytorch/pytorch/actions/runs/18620814622/job/53092469720

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165481
Approved by: https://github.com/jeffdaily

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
This commit is contained in:
amdfaa 2025-10-21 04:02:04 +00:00 committed by PyTorch MergeBot
parent 0e083942cc
commit ffa90d46e6
2 changed files with 14 additions and 9 deletions

View File

@ -54,12 +54,17 @@ self-hosted-runner:
- windows-11-arm64
- windows-11-arm64-preview
# Organization-wide AMD-hosted runners
# MI2xx runners
# MI2xx non-ARC runners
- linux.rocm.gpu
- linux.rocm.gpu.mi250
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# gfx942 runners
- linux.rocm.gpu.mi250
- linux.rocm.gpu.gfx1100
# MI2xx ARC runners
- linux.rocm.gpu.mi250.1
- linux.rocm.gpu.mi250.2
- linux.rocm.gpu.mi250.4
# gfx942 ARC runners
- linux.rocm.gpu.gfx942.1
- linux.rocm.gpu.gfx942.2
- linux.rocm.gpu.gfx942.4

View File

@ -36,12 +36,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
]}
secrets: inherit