mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Add new GHA workflow to cache ROCm CI docker images on MI300 CI runners periodically (#148394)
Refiling https://github.com/pytorch/pytorch/pull/148387 from pytorch repo branch to get AWS login via OIDC working Successful docker caching run: https://github.com/pytorch/pytorch/actions/runs/13843689908/job/38737095535 Run without cached docker image: https://github.com/pytorch/pytorch/actions/runs/13843692637/job/38746033460  Run with cached docker image:  ~6 min vs 3 s :) Thanks @saienduri for the help on the MI300 infra side Pull Request resolved: https://github.com/pytorch/pytorch/pull/148394 Approved by: https://github.com/jeffdaily
This commit is contained in:
parent
9ad6265d04
commit
1c7196f04b
1
.github/actionlint.yaml
vendored
1
.github/actionlint.yaml
vendored
|
|
@ -52,6 +52,7 @@ self-hosted-runner:
|
|||
- linux.rocm.gpu
|
||||
- linux.rocm.gpu.2
|
||||
- linux.rocm.gpu.4
|
||||
- rocm-docker
|
||||
# Repo-specific Apple hosted runners
|
||||
- macos-m1-ultra
|
||||
- macos-m2-14
|
||||
|
|
|
|||
55
.github/workflows/docker-cache-mi300.yml
vendored
Normal file
55
.github/workflows/docker-cache-mi300.yml
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
name: docker-cache-mi300
|
||||
|
||||
on:
|
||||
# run every 6 hours
|
||||
schedule:
|
||||
- cron: 0 0,6,12,18 * * *
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
docker-cache:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
runs-on: rocm-docker
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
|
||||
with:
|
||||
no-sudo: true
|
||||
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
|
||||
- name: Login to Amazon ECR
|
||||
id: login-ecr
|
||||
continue-on-error: false
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-image-name: pytorch-linux-focal-rocm-n-py3
|
||||
push: false
|
||||
|
||||
- name: Pull docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
|
||||
- name: Tar and upload to S3 bucket
|
||||
run: |
|
||||
sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress
|
||||
Loading…
Reference in New Issue
Block a user