**This is a re-submit of this PR; the previous version broke forked pull requests by checking out the head ref as opposed to the head sha.**

There are two commits that we sometimes test in CI:

1. The merge commit (a test merge between the PR head ref and the latest base ref).
2. The head ref (the exact commit that was at the head of the user's branch when they pushed).

This distinction is fairly subtle; in case 1, you are effectively running against a "rebased" version of your PR's branch. The problem is that we use *both* of these commits today, with confusing results: depending on how you put up your PR and what workflows are running, we might be testing two different commits! We should probably consolidate on one. This would eliminate a subtle but complex part of our CI (I am mildly horrified by the complexity of [this explanation](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md#which-commit-is-used-in-ci), although it's heroic that someone went and documented it lol).

This PR consolidates on using the head ref (option 2), because:

- This is the behavior of phabricator/fbcode, which many PT devs will be more familiar with.
- This is the behavior of ghstack.
- Our master branch moves quite quickly, so the chance of a substantial divergence between your local test runs and CI is high, with confusing results that are nondeterministic based on when you put up the PR.
- We use a linear history/squash-rebase-merge workflow, which is better modeled by option 2. Option 1 effectively emulates a merge-commit-style workflow.

The primary disadvantage is that when re-running workflows, you will no longer be re-running against a "rebased" version of the PR, but against the exact head ref that was pushed. Tbh I find it quite unintuitive that what you're testing changes depending on when you press the re-run button, but I know at least @malfet does this, so it's worth mentioning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/71974
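Concretely, the consolidation shows up in the checkout step of the generated workflows (see the `Checkout PyTorch` step in the file below): on `pull_request` events we check out the exact pushed head SHA, and otherwise fall back to `github.sha`:

```yaml
# Checkout step as generated after this change: PRs use the pushed head SHA
# (option 2), not a test merge commit against the latest base ref.
- name: Checkout PyTorch
  uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
  with:
    ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
    # deep clone, to allow use of git merge-base
    fetch-depth: 0
    submodules: recursive
```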
239 lines
10 KiB
YAML
Generated
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7

on:
  push:
    tags:
      - 'ciflow/all/*'
      - 'ciflow/cuda/*'
      - 'ciflow/libtorch/*'
      - 'ciflow/linux/*'
      - 'ciflow/scheduled/*'
  schedule:
    - cron: 45 0,4,8,12,16,20 * * *
  workflow_dispatch:
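
# Note: the ciflow/* tag patterns above are how PyTorch's label-driven CIFlow
# automation triggers this workflow; the cron schedule runs it periodically.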

env:
  BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7
  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
  TORCH_CUDA_ARCH_LIST: 5.2
  IN_CI: 1
  IS_GHA: 1
  # This is used for the phase of adding wheel tests only, will be removed once completed
  IN_WHEEL_TEST: 1
  # Used for custom_operator, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  AWS_DEFAULT_REGION: us-east-1
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  PYTORCH_RETRY_TEST_CASES: 1

# Cancel any in-flight run for the same PR (or commit SHA, for pushes);
# workflow_dispatch runs are grouped separately.
concurrency:
  group: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
  build:
    runs-on: linux.2xlarge
    timeout-minutes: 240
    env:
      JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7-build
    outputs:
      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
    steps:
      - name: print labels
        run: echo "${PR_LABELS}"
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - name: Chown workspace
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
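      # The docker tag below is the git tree hash of the .circleci/docker
      # directory, so it changes only when the docker build inputs change.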
      - name: Calculate docker image tag
        id: calculate-tag
        run: |
          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
          echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "::set-output name=docker_tag::${DOCKER_TAG}"
          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
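      # If the image for this tag already exists in ECR, the build-and-push
      # step is skipped; otherwise the merge-base comparison below decides
      # whether a rebuild is legitimate or the expected image has gone missing.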
      - name: Check if image should be built
        id: check
        env:
          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
          set -x
          # Check if image already exists, if it does then skip building it
          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
            exit 0
          fi
          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
            # if we're on the base branch then use the parent commit
            MERGE_BASE=$(git rev-parse HEAD~)
          else
            # otherwise we're on a PR, so use the most recent base commit
            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
          fi
          # Covers the case where a previous tag doesn't exist for the tree
          # this is only really applicable on trees that don't have `.circleci/docker` at their merge base, i.e. nightly
          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
            exit 1
          fi
          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
          # If no image exists but the hash is the same as the previous hash then we should error out here
          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
            echo "  contact the PyTorch team to restore the original images"
            exit 1
          fi
          echo ::set-output name=rebuild::yes
      - name: Build and push docker image
        if: ${{ steps.check.outputs.rebuild }}
        env:
          DOCKER_SKIP_S3_UPLOAD: 1
        working-directory: .circleci/docker
        run: |
          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
          ./build_docker.sh
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
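      # The build itself runs inside the CI docker image; GITHUB_* variables
      # are forwarded via the env file written by the "Preserve github env
      # variables for use in docker" step above.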
      - name: Build
        env:
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        run: |
          # detached container should get cleaned up by teardown_ec2_linux
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e JOB_BASE_NAME \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e IS_GHA \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
            -e GITHUB_RUN_ID \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
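      # Stats upload is best-effort: the trailing `|| exit 0` keeps the job
      # green even if the Scuba upload fails.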
      - name: Display and upload binary build size statistics (Click Me)
        # temporary hack: set CIRCLE_* vars, until we update
        # tools/stats/print_test_stats.py to natively support GitHub Actions
        env:
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: '${{ github.run_id }}'
        run: |
          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
          export COMMIT_TIME
          pip3 install requests==2.26 boto3==1.16.34
          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Clean up docker images
        if: always()
        run: |
          # Prune all of the docker images
          docker system prune -af