Add new GHA workflow to cache ROCm CI docker images on MI300 CI runners periodically (#148394)

Refiling https://github.com/pytorch/pytorch/pull/148387 from pytorch repo branch to get AWS login via OIDC working

Successful docker caching run: https://github.com/pytorch/pytorch/actions/runs/13843689908/job/38737095535
Run without cached docker image: https://github.com/pytorch/pytorch/actions/runs/13843692637/job/38746033460
![image](https://github.com/user-attachments/assets/c410ff35-a150-4885-b904-3a5e1888c032)
Run with cached docker image:
![image](https://github.com/user-attachments/assets/41e417b5-a795-4ed2-a9cd-00151db8f813)
~6 min vs 3 s :)

Thanks @saienduri for the help on the MI300 infra side

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148394
Approved by: https://github.com/jeffdaily
This commit is contained in:
Jithun Nair 2025-03-15 00:34:00 +00:00 committed by PyTorch MergeBot
parent 9ad6265d04
commit 1c7196f04b
2 changed files with 56 additions and 0 deletions

View File

@ -52,6 +52,7 @@ self-hosted-runner:
- linux.rocm.gpu
- linux.rocm.gpu.2
- linux.rocm.gpu.4
- rocm-docker
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14

View File

@ -0,0 +1,55 @@
name: docker-cache-mi300
on:
# run every 6 hours
schedule:
- cron: 0 0,6,12,18 * * *
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
docker-cache:
if: github.repository_owner == 'pytorch'
runs-on: rocm-docker
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: true
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Login to Amazon ECR
id: login-ecr
continue-on-error: false
uses: aws-actions/amazon-ecr-login@v2
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: pytorch-linux-focal-rocm-n-py3
push: false
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Tar and upload to S3 bucket
run: |
sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress