[CI] Reuse old whl (#153838)

~50% of commits on main only touch python files unrelated to the object files in the whl, meaning that we could reuse old whls and put the current commit's python files into the whl.  This PR does that in CI by identifying a previous job whose artifacts and whl binaries can be reused.  See https://docs.google.com/document/d/1nQ1FNJqnJuSFRiM2HvQ27zg6Vm-77n7LECp30zYfTDk/edit?tab=t.icom2lesr6es for more details.

To reuse:
* the changed files between the whl's commit and the current commit can only be python files in test/ or torch/ and not in torch/csrc
* not on main branch or release branch
* ci-force-rebuild not on PR
* special abort issue is closed
* artifact should exist

Pros:
* build time -> 6 min whenever this can be done

Cons:
* not sure if I have the right files
* version + whl name still remains the same

Testing:
Unfortunately this PR's changed files are not on the list of acceptable changed files for reusing the whl, so I've been mangling it on other PRs to get things like https://github.com/pytorch/pytorch/actions/runs/15119214901/job/42497650394?pr=147470 (It is enabled on linux-focal-cuda12.6-py3.10-gcc11 / build and there are changes in common_utils.py to make sure the copying of python takes effect)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/153838
Approved by: https://github.com/malfet
This commit is contained in:
Catherine Lee 2025-05-19 20:56:44 +00:00 committed by PyTorch MergeBot
parent 9180bb187c
commit cc48550e6f
4 changed files with 344 additions and 7 deletions

View File

@ -0,0 +1,38 @@
name: Reuse old wheel if possible

description:
  Reuse old wheel if possible

inputs:
  build-environment:
    description: Build environment
    required: true
  run-id:
    description: Workflow run ID
    required: true
  github-token:
    description: GitHub token
    required: true

outputs:
  reuse:
    description: Whether the wheel is reused or not
    value: ${{ steps.check-file-changes.outputs.reuse }}

runs:
  using: composite
  steps:
    # NOTE: this step does not check out the repository itself. The script
    # runs `git merge-base` / `git diff` against origin/main, so it presumably
    # needs pytorch to already be checked out with fetch depth 0 - confirm in
    # the calling workflow.
    - name: Check file changes
      id: check-file-changes
      shell: bash
      # Best effort: if the check fails, the `reuse` output is never set and
      # the caller falls back to a normal build.
      continue-on-error: true
      env:
        GITHUB_TOKEN: ${{ inputs.github-token }}
        # Pass expression values through the environment instead of
        # interpolating them into the script text, so that their contents can
        # never be interpreted as shell syntax.
        INPUT_BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
        INPUT_RUN_ID: ${{ inputs.run-id }}
        INPUT_GITHUB_REF: ${{ github.ref }}
      run: |
        set -x
        python3 "${GITHUB_ACTION_PATH}/reuse_old_whl.py" \
          --build-environment "${INPUT_BUILD_ENVIRONMENT}" \
          --run-id "${INPUT_RUN_ID}" \
          --github-ref "${INPUT_GITHUB_REF}"

View File

@ -0,0 +1,280 @@
import argparse
import os
import subprocess
from functools import lru_cache
from pathlib import Path
from typing import Any, cast, Optional
import requests
# Name of the PR label that, when present, forces a full rebuild instead of
# reusing an old whl.
FORCE_REBUILD_LABEL = "ci-force-rebuild"
@lru_cache
def get_merge_base() -> str:
    """Return the merge base of ``HEAD`` and ``origin/main``.

    The value comes from ``git merge-base`` run in the current working
    directory; git's stderr is discarded. The result is cached, so git is
    invoked at most once per process.

    Raises:
        subprocess.CalledProcessError: if the git command exits non-zero.
    """
    command = ["git", "merge-base", "HEAD", "origin/main"]
    raw_output = subprocess.check_output(
        command, text=True, stderr=subprocess.DEVNULL
    )
    return raw_output.strip()
@lru_cache
def get_head_sha() -> str:
    """Return the commit SHA that ``HEAD`` currently points to.

    The value comes from ``git rev-parse HEAD`` run in the current working
    directory; git's stderr is discarded. The result is cached, so git is
    invoked at most once per process.

    Raises:
        subprocess.CalledProcessError: if the git command exits non-zero.
    """
    command = ["git", "rev-parse", "HEAD"]
    raw_output = subprocess.check_output(
        command, text=True, stderr=subprocess.DEVNULL
    )
    return raw_output.strip()
def is_main_branch() -> bool:
    """Report whether the current commit is on the main branch.

    The real check is currently disabled, so this always returns ``False``.
    """
    # Testing on main branch for now. The intended check, kept for reference:
    # print(
    #     f"Checking if we are on main branch: merge base {get_merge_base()}, head {get_head_sha()}"
    # )
    # return get_merge_base() == get_head_sha()
    return False
def query_github_api(url: str) -> Any:
    """GET ``url`` from the GitHub REST API and return the decoded JSON body.

    The request is authenticated with the token in the ``GITHUB_TOKEN``
    environment variable. The HTTP status is not checked, so an error response
    is returned as whatever JSON GitHub sent back; callers must cope with
    missing keys.

    Args:
        url: Full API URL to query.

    Returns:
        The parsed JSON payload (a dict or a list, depending on the endpoint).

    Raises:
        KeyError: if ``GITHUB_TOKEN`` is not set.
        requests.RequestException: on connection problems or when the request
            times out.
    """
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    }
    # requests has no default timeout; without one a stalled connection would
    # hang the CI step indefinitely.
    response = requests.get(url, headers=headers, timeout=60)
    return response.json()
@lru_cache
def check_labels_for_pr() -> bool:
    """Return True if any PR containing HEAD carries ``FORCE_REBUILD_LABEL``.

    Queries the GitHub "pulls for commit" endpoint for the current HEAD SHA
    and scans the labels of every PR it returns. The result is cached for the
    lifetime of the process.
    """
    commit = get_head_sha()
    pulls = query_github_api(
        f"https://api.github.com/repos/pytorch/pytorch/commits/{commit}/pulls"
    )
    pull_count = len(pulls)
    pull_numbers = [pull["number"] for pull in pulls]
    print(f"Found {pull_count} PRs for commit {commit}: {pull_numbers}")

    for pull in pulls:
        # Lazily walk the label names; stops at the first match.
        label_names = (label["name"] for label in pull.get("labels", []))
        if FORCE_REBUILD_LABEL in label_names:
            print(f"Found label {FORCE_REBUILD_LABEL} in PR {pull['number']}.")
            return True
    return False
def check_issue_open() -> bool:
    """Return True if pytorch/pytorch issue #153759 is currently open.

    That issue acts as a config switch: while it is open, everyone is forced
    to rebuild instead of reusing an old whl.
    """
    issue = query_github_api(
        "https://api.github.com/repos/pytorch/pytorch/issues/153759"
    )
    is_open = issue.get("state") == "open"
    print("Issue #153759 is open." if is_open else "Issue #153759 is not open.")
    return is_open
def get_workflow_id(run_id: str) -> Optional[str]:
    """Look up the workflow ID that the given workflow run belongs to.

    Args:
        run_id: ID of a GitHub Actions workflow run in pytorch/pytorch.

    Returns:
        The ``workflow_id`` field of the run, or ``None`` when the API
        response does not contain one.
    """
    run_info = query_github_api(
        f"https://api.github.com/repos/pytorch/pytorch/actions/runs/{run_id}"
    )
    if "workflow_id" not in run_info:
        print("No workflow ID found.")
        return None

    print(f"Found workflow ID for run ID {run_id}: {run_info['workflow_id']}")
    return cast(str, run_info["workflow_id"])
def ok_changed_file(file: str) -> bool:
    """Return True if a change to ``file`` still allows reusing the old whl.

    Allowed files are Python sources (``.py``) under ``test/`` or under
    ``torch/``, except anything under ``torch/csrc/``.
    """
    # Only Python sources can ever be acceptable.
    if not file.endswith(".py"):
        return False
    if file.startswith("test/"):
        return True
    return file.startswith("torch/") and not file.startswith("torch/csrc/")
def check_changed_files(sha: str) -> bool:
    """Return True if every file changed between ``sha`` and HEAD is allowed.

    The changed files are taken from ``git diff --name-only sha HEAD`` and each
    one is checked with ``ok_changed_file``. Checking stops at the first file
    that is not allowed.

    Args:
        sha: Commit to diff HEAD against.

    Returns:
        True when all changed files are allowed (including when nothing
        changed), False otherwise.

    Raises:
        subprocess.CalledProcessError: if the git command exits non-zero.
    """
    diff_output = subprocess.check_output(
        ["git", "diff", "--name-only", sha, "HEAD"],
        text=True,
        stderr=subprocess.DEVNULL,
    )
    # git prints one path per line. Split on line boundaries rather than on
    # arbitrary whitespace so a path containing a space is checked as a whole
    # instead of as several unrelated fragments.
    changed_files = [line for line in diff_output.splitlines() if line]

    print(f"Checking changed files between {sha} and HEAD:")
    for file in changed_files:
        if not ok_changed_file(file):
            print(f"  File {file} is not allowed to be changed.")
            return False
        print(f"  File {file} is allowed to be changed.")
    return True
def find_old_whl(workflow_id: str, build_environment: str, sha: str) -> bool:
    """Find a previously built artifact on S3 and download it to artifacts.zip.

    Lists the runs of ``workflow_id`` on main for commit ``sha`` through the
    GitHub API, then probes the ``gha-artifacts`` S3 bucket for each run until
    one has an ``artifacts.zip`` for ``build_environment``.

    Args:
        workflow_id: ID of the workflow whose runs should be searched.
        build_environment: Build environment name used in the S3 key.
        sha: Commit whose workflow runs are searched.

    Returns:
        True if an artifact was downloaded to ``artifacts.zip`` in the current
        directory, False if none could be found.
    """
    if build_environment is None:
        print("BUILD_ENVIRONMENT is not set.")
        return False
    print(f"SHA: {sha}, workflow_id: {workflow_id}")

    workflow_runs = query_github_api(
        f"https://api.github.com/repos/pytorch/pytorch/actions/workflows/{workflow_id}/runs?head_sha={sha}&branch=main&per_page=100"
    )
    if workflow_runs.get("total_count", 0) == 0:
        print("No workflow runs found.")
        return False

    # Download into a temporary name and rename on success, so that
    # artifacts.zip only ever exists as a complete file.
    partial_path = "artifacts.zip.part"
    for run in workflow_runs.get("workflow_runs", []):
        # Look in s3 for the old whl
        run_id = run["id"]
        url = f"https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/{run_id}/{build_environment}/artifacts.zip"
        print(f"Checking for old whl at {url}")
        try:
            # Stream the body to disk in chunks instead of buffering the whole
            # artifact in memory, and bound every network wait with a timeout
            # so a stalled connection cannot hang the job.
            with requests.get(url, stream=True, timeout=60) as response:
                if response.status_code != 200:
                    continue
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        except requests.RequestException as e:
            print(f"Error checking for old whl: {e}")
            # Drop whatever was written before the failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            continue
        os.replace(partial_path, "artifacts.zip")
        print(f"Found old whl file from s3: {url}")
        return True
    return False
def unzip_artifact_and_replace_files() -> None:
    """Refresh the Python files inside the downloaded artifact's wheel(s).

    Expects ``artifacts.zip`` in the current directory and a ``torch``
    directory to copy from. Steps:

    1. Extract ``artifacts.zip`` into ``artifacts/`` and delete the zip.
    2. For every ``*.whl`` in ``artifacts/dist``: unpack it, rsync the local
       ``torch`` directory over its contents, and repack it under the same
       wheel file name.
    3. Re-create ``artifacts.zip`` in the current directory from ``artifacts/``.

    Relies on the ``unzip``, ``zip``, ``rsync``, ``mv`` and ``rm`` commands.

    Raises:
        subprocess.CalledProcessError: if any of the external commands fails.
    """
    # Unzip the artifact and replace files
    subprocess.check_output(
        ["unzip", "-o", "artifacts.zip", "-d", "artifacts"],
    )
    os.remove("artifacts.zip")

    # Rename wheel into zip
    wheel_path = Path("artifacts/dist").glob("*.whl")
    for path in wheel_path:
        new_path = path.with_suffix(".zip")
        os.rename(path, new_path)
        print(f"Renamed {path} to {new_path}")
        print(new_path.stem)
        extract_dir = f"artifacts/dist/{new_path.stem}"

        # Unzip the wheel
        subprocess.check_output(
            ["unzip", "-o", new_path, "-d", extract_dir],
        )
        # The renamed copy of the old wheel is no longer needed once it has
        # been extracted. Remove it so it is not packed into the final
        # artifacts.zip next to the rebuilt wheel.
        os.remove(new_path)

        # Copy python files into the artifact
        subprocess.check_output(
            ["rsync", "-avz", "torch", extract_dir],
        )

        # Zip the wheel back
        subprocess.check_output(
            ["zip", "-r", f"{new_path.stem}.zip", "."],
            cwd=extract_dir,
        )
        subprocess.check_output(
            [
                "mv",
                f"{extract_dir}/{new_path.stem}.zip",
                f"artifacts/dist/{new_path.stem}.whl",
            ],
        )

        # Remove the extracted folder
        subprocess.check_output(
            ["rm", "-rf", extract_dir],
        )

    # Rezip the artifact
    subprocess.check_output(["zip", "-r", "artifacts.zip", "."], cwd="artifacts")
    subprocess.check_output(
        ["mv", "artifacts/artifacts.zip", "."],
    )
    return None
def set_output() -> None:
    """Publish ``reuse=true`` as an output of the current GitHub Actions step.

    Appends the line to the file named by the ``GITHUB_OUTPUT`` environment
    variable when it is set and non-empty; otherwise prints the
    ``::set-output`` workflow command instead.
    """
    # To only monitor without enabling reuse, this function can be turned
    # into a no-op.
    output_file = os.getenv("GITHUB_OUTPUT")
    if not output_file:
        print("::set-output name=reuse::true")
        return
    with open(str(output_file), "a") as env:
        print("reuse=true", file=env)
def parse_args() -> argparse.Namespace:
    """Parse the command line of the script.

    Recognised options:
        --run-id: required string.
        --build-environment: required string.
        --github-ref: optional string, ``None`` when omitted.
    """
    parser = argparse.ArgumentParser(description="Check for old whl files.")
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ("--run-id", {"type": str, "required": True, "help": "Workflow ID"}),
        (
            "--build-environment",
            {"type": str, "required": True, "help": "Build environment"},
        ),
        ("--github-ref", {"type": str}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def can_reuse_whl(args: argparse.Namespace) -> bool:
    """Decide whether an old whl can be reused for the current commit.

    The checks run in this order and the first failing one stops the process:
    force-rebuild label on the PR, the kill-switch issue being open, disallowed
    changed files since the merge base, the workflow ID lookup, and finally
    downloading an old artifact for the merge base.

    Args:
        args: Parsed command line; ``run_id`` and ``build_environment`` are read.

    Returns:
        True if an old artifact was found and downloaded, False if the whl
        must be rebuilt.
    """
    # The main/release branch check is currently disabled:
    # if is_main_branch() or (
    #     args.github_ref
    #     and any(
    #         args.github_ref.startswith(x)
    #         for x in ["refs/heads/release", "refs/tags/v", "refs/heads/main"]
    #     )
    # ):
    #     print("On main branch or release branch, rebuild whl")
    #     return False

    if check_labels_for_pr():
        print(f"Found {FORCE_REBUILD_LABEL} label on PR, rebuild whl")
        return False

    if check_issue_open():
        print("Issue #153759 is open, rebuild whl")
        return False

    merge_base = get_merge_base()
    if not check_changed_files(merge_base):
        print("Cannot use old whl due to the changed files, rebuild whl")
        return False

    workflow_id = get_workflow_id(args.run_id)
    if workflow_id is None:
        print("No workflow ID found, rebuild whl")
        return False

    found = find_old_whl(workflow_id, args.build_environment, merge_base)
    if not found:
        print("No old whl found, rebuild whl")
        # TODO: go backwards from merge base to find more runs
    return found
if __name__ == "__main__":
    # Script entry point: when reuse is possible, patch the downloaded
    # artifact with the current Python files and report reuse to the workflow.
    args = parse_args()
    reuse_ok = can_reuse_whl(args)
    if reuse_ok:
        print("Reusing old whl")
        unzip_artifact_and_replace_files()
        set_output()

View File

@ -93,6 +93,14 @@ on:
type: number
default: 1
allow-reuse-old-whl:
description: |
If set, the build tries to pull an old wheel from s3 that was built on a
commit with no cpp changes from this commit
required: false
type: boolean
default: false
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
@ -150,6 +158,15 @@ jobs:
role-session-name: gha-linux-build
aws-region: us-east-1
- name: Check if can use old whl build
id: use-old-whl
uses: ./.github/actions/reuse-old-whl
if: ${{ inputs.allow-reuse-old-whl && github.event_name == 'push' }}
with:
build-environment: ${{ inputs.build-environment }}
run-id: ${{ github.run_id }}
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
@ -159,7 +176,7 @@ jobs:
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
@ -169,7 +186,7 @@ jobs:
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -218,14 +235,14 @@ jobs:
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
s3_bucket: ${{ inputs.s3-bucket }}
- name: Build
if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && steps.use-old-whl.outputs.reuse != 'true'
id: build
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
@ -329,13 +346,13 @@ jobs:
kill "$MONITOR_SCRIPT_PID"
- name: Archive artifacts into zip
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && steps.use-old-whl.outputs.reuse != 'true'
run: |
zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
@ -345,7 +362,7 @@ jobs:
- name: Store PyTorch Build Artifacts for s390x
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel'
if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14

View File

@ -296,6 +296,7 @@ jobs:
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
allow-reuse-old-whl: true
secrets: inherit
linux-focal-cuda12_6-py3_10-gcc11-test:
@ -469,6 +470,7 @@ jobs:
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
allow-reuse-old-whl: true
secrets: inherit
linux-focal-cuda12_6-py3_10-gcc11-sm89-test: