mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[ROCm][CI] Update rocm.yml workflow to use 1 GPU ARC runners (#165481)
* Move rocm.yml off the persistent non-ARC runners of the combined MI2xx (MI210 + MI250) cluster and onto the ARC runners of the MI250 cluster. This halves the number of nodes but provides access to approximately 4 times as many runners, since every 8-GPU MI250 node now provides eight 1-GPU runners. This should help with concurrent capacity and queueing on the MI2xx jobs. Tested here successfully: https://github.com/pytorch/pytorch/actions/runs/18620814622/job/53092469720 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165481 Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony, https://github.com/albanD Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
This commit is contained in:
parent
ab82456c16
commit
8700d68fef
13
.github/actionlint.yaml
vendored
13
.github/actionlint.yaml
vendored
|
|
@ -1,4 +1,4 @@
|
|||
self-hosted-runner:
|
||||
self-hosted-runner:
|
||||
labels:
|
||||
# GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old
|
||||
- ubuntu-24.04
|
||||
|
|
@ -54,12 +54,17 @@ self-hosted-runner:
|
|||
- windows-11-arm64
|
||||
- windows-11-arm64-preview
|
||||
# Organization-wide AMD-hosted runners
|
||||
# MI2xx runners
|
||||
# MI2xx non-ARC runners
|
||||
- linux.rocm.gpu
|
||||
- linux.rocm.gpu.mi250
|
||||
- linux.rocm.gpu.2
|
||||
- linux.rocm.gpu.4
|
||||
# gfx942 runners
|
||||
- linux.rocm.gpu.mi250
|
||||
- linux.rocm.gpu.gfx1100
|
||||
# MI2xx ARC runners
|
||||
- linux.rocm.gpu.mi250.1
|
||||
- linux.rocm.gpu.mi250.2
|
||||
- linux.rocm.gpu.mi250.4
|
||||
# gfx942 ARC runners
|
||||
- linux.rocm.gpu.gfx942.1
|
||||
- linux.rocm.gpu.gfx942.2
|
||||
- linux.rocm.gpu.gfx942.4
|
||||
|
|
|
|||
12
.github/workflows/rocm.yml
vendored
12
.github/workflows/rocm.yml
vendored
|
|
@ -36,12 +36,12 @@ jobs:
|
|||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
|
||||
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
|
||||
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
|
||||
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
|
||||
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
|
||||
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
|
||||
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user