diff --git a/.github/scripts/generate_linux_ci_workflows.py b/.github/scripts/generate_linux_ci_workflows.py
new file mode 100755
index 00000000000..135034f24b0
--- /dev/null
+++ b/.github/scripts/generate_linux_ci_workflows.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+
+from pathlib import Path
+
+import jinja2
+
+DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
+
+GITHUB_DIR = Path(__file__).parent.parent
+
+CPU_TEST_RUNNER = "linux.2xlarge"
+CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
+
+
+class PyTorchLinuxWorkflow:
+    def __init__(self, build_environment: str, docker_image_base: str):
+        self.build_environment = build_environment
+        self.docker_image_base = docker_image_base
+        self.test_runner_type = CPU_TEST_RUNNER
+        if "cuda" in build_environment:
+            self.test_runner_type = CUDA_TEST_RUNNER
+
+    def generate_workflow_file(
+        self, workflow_template: jinja2.Template, jinja_env: jinja2.Environment
+    ) -> Path:
+        output_file_path = GITHUB_DIR.joinpath(
+            f"workflows/{self.build_environment}.yml"
+        )
+        with open(output_file_path, "w") as output_file:
+            output_file.write(
+                workflow_template.render(
+                    build_environment=self.build_environment,
+                    docker_image_base=self.docker_image_base,
+                    test_runner_type=self.test_runner_type
+                )
+            )
+            output_file.write('\n')
+        return output_file_path
+
+
+WORKFLOWS = [
+    PyTorchLinuxWorkflow(
+        build_environment="pytorch-linux-xenial-py3.6-gcc5.4",
+        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+    ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-paralleltbb-linux-xenial-py3.6-gcc5.4",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-parallelnative-linux-xenial-py3.6-gcc5.4",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-pure_torch-linux-xenial-py3.6-gcc5.4",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3.6-gcc7",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-asan",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang7-onnx",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-libtorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-bionic-py3.6-clang9-noarch",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-xla-linux-bionic-py3.6-clang9",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-vulkan-linux-bionic-py3.6-clang9",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.6-clang9",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-bionic-py3.8-gcc9-coverage",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.8-gcc9",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-bionic-rocm3.9-py3.6",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm3.9-py3.6",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-dynamic",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-custom-static",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-mobile-code-analysis",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+    # PyTorchLinuxWorkflow(
+    #     build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a",
+    #     docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+    # ),
+]
+
+if __name__ == "__main__":
+    jinja_env = jinja2.Environment(
+        variable_start_string="!{{",
+        loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))),
+    )
+    workflow_template = jinja_env.get_template("linux_ci_workflow.yml.in")
+    for workflow in WORKFLOWS:
+        print(
+            workflow.generate_workflow_file(
+                workflow_template=workflow_template,
+                jinja_env=jinja_env
+            )
+        )
diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
new file mode 100755
index 00000000000..69337e08dac
--- /dev/null
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
+DRIVER_FN="NVIDIA-Linux-x86_64-460.39.run"
+YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
+
+install_nvidia_docker2_amzn2() {
+    (
+        set -x
+        # Needed for yum-config-manager
+        sudo yum install -y yum-utils
+        sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
+        sudo yum install -y nvidia-docker2
+        sudo systemctl restart docker
+    )
+}
+
+install_nvidia_driver() {
+    (
+        set -x
+        sudo yum groupinstall -y "Development Tools"
+        curl -fsL -o nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
+        sudo /bin/bash nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+        nvidia-smi
+    )
+}
+
+# Install container toolkit based on distribution
+echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
+case "${DISTRIBUTION}" in
+    amzn*)
+        install_nvidia_docker2_amzn2
+        ;;
+    *)
+        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
+        exit 1
+        ;;
+esac
+
+echo "== Installing nvidia driver ${DRIVER_FN} =="
+install_nvidia_driver
diff --git a/.github/scripts/report_git_status.sh b/.github/scripts/report_git_status.sh
new file mode 100755
index 00000000000..357bacfecb2
--- /dev/null
+++ b/.github/scripts/report_git_status.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+CHANGES=$(git status --porcelain)
+echo "$CHANGES"
+git diff
+[ -z "$CHANGES" ]
diff --git a/.github/templates/linux_ci_workflow.yml.in b/.github/templates/linux_ci_workflow.yml.in
new file mode 100644
index 00000000000..a816af1d410
--- /dev/null
+++ b/.github/templates/linux_ci_workflow.yml.in
@@ -0,0 +1,174 @@
+# @generated by .github/scripts/generate_linux_ci_workflows.py; do not update manually
+#
+# Template is at:    .github/templates/linux_ci_workflow.yml.in
+# Generation script: .github/scripts/generate_linux_ci_workflows.py
+name: Linux CI (!{{ build_environment }})
+
+on:
+  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  # pull_request:
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: !{{ build_environment }}
+  DOCKER_IMAGE_BASE: !{{ docker_image_base }}
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+
+jobs:
+  calculate-docker-image:
+    runs-on: ubuntu-18.04
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+  build:
+    runs-on: linux.2xlarge
+    needs: calculate-docker-image
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+    steps:
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          submodules: recursive
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Build PyTorch
+        run: |
+          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
+          MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs; if we use all of them we'll OOM
+          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
+          docker run \
+            -e BUILD_ENVIRONMENT \
+            -e MAX_JOBS \
+            -e SCCACHE_BUCKET \
+            -e SKIP_SCCACHE_INITIALIZATION=1 \
+            -e TORCH_CUDA_ARCH_LIST \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+      - name: Archive artifacts into zip
+        run: |
+          zip -q -r artifacts.zip dist build
+      - uses: actions/upload-artifact@v2
+        name: Store PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 30
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
+  test:
+    runs-on: !{{ test_runner_type }}
+    needs:
+      - calculate-docker-image
+      - build
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+    steps:
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: actions/download-artifact@v2
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -q artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Test PyTorch
+        run: |
+          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
+          MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs; if we use all of them we'll OOM
+          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
+          # GPU_FLAG is deliberately left unquoted so an empty value expands to nothing
+          # shellcheck disable=SC2086
+          docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e IN_CI \
+            -e MAX_JOBS \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && .jenkins/pytorch/test.sh'
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 5047246453a..e39b8cb8fc3 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -91,6 +91,23 @@ jobs:
         run: |
           python2 setup.py | grep "Python 2 has reached end-of-life and is no longer supported by PyTorch."
 
+  templates:
+    runs-on: ubuntu-18.04
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.x
+          architecture: x64
+      - name: Install Jinja2
+        run: pip install Jinja2
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+      - name: Regenerate workflows
+        run: .github/scripts/generate_linux_ci_workflows.py
+      - name: Assert that regenerating the workflows didn't change them
+        run: .github/scripts/report_git_status.sh
+
   toc:
     runs-on: ubuntu-18.04
     # https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687
@@ -111,12 +128,7 @@
           markdown-toc --bullets='-' -i "$FILE"
         done
       - name: Assert that regenerating the ToCs didn't change them
-        run: |
-          set -eux
-          CHANGES=$(git status --porcelain)
-          echo "$CHANGES"
-          git diff
-          [ -z "$CHANGES" ]
+        run: .github/scripts/report_git_status.sh
 
   flake8-py3:
     runs-on: ubuntu-18.04
diff --git a/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
new file mode 100644
index 00000000000..d799a9aeebd
--- /dev/null
+++ b/.github/workflows/pytorch-linux-xenial-py3.6-gcc5.4.yml
@@ -0,0 +1,174 @@
+# @generated by .github/scripts/generate_linux_ci_workflows.py; do not update manually
+#
+# Template is at:    .github/templates/linux_ci_workflow.yml.in
+# Generation script: .github/scripts/generate_linux_ci_workflows.py
+name: Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
+
+on:
+  # TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
+  # pull_request:
+  push:
+    branches:
+      - master
+      - release/*
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
+  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4
+  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+  TORCH_CUDA_ARCH_LIST: 5.2
+  IN_CI: 1
+
+jobs:
+  calculate-docker-image:
+    runs-on: ubuntu-18.04
+    outputs:
+      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+      - name: Calculate docker image tag
+        id: calculate-tag
+        run: |
+          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
+  build:
+    runs-on: linux.2xlarge
+    needs: calculate-docker-image
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+    steps:
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # deep clone, to allow sharding to use git rev-list
+          submodules: recursive
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Build PyTorch
+        run: |
+          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
+          MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs; if we use all of them we'll OOM
+          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
+          docker run \
+            -e BUILD_ENVIRONMENT \
+            -e MAX_JOBS \
+            -e SCCACHE_BUCKET \
+            -e SKIP_SCCACHE_INITIALIZATION=1 \
+            -e TORCH_CUDA_ARCH_LIST \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+      - name: Archive artifacts into zip
+        run: |
+          zip -q -r artifacts.zip dist build
+      - uses: actions/upload-artifact@v2
+        name: Store PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+          retention-days: 30
+          if-no-files-found: error
+          path:
+            artifacts.zip
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Prune all of the docker images
+          docker system prune -af
+  test:
+    runs-on: linux.2xlarge
+    needs:
+      - calculate-docker-image
+      - build
+    env:
+      DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
+    steps:
+      - name: Chown workspace
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+      - name: Checkout PyTorch
+        uses: actions/checkout@v2
+      - name: Log in to ECR
+        run: |
+          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
+          bash /tmp/ecr-login.sh
+          rm /tmp/ecr-login.sh
+      - name: Pull docker image
+        run: |
+          docker pull "${DOCKER_IMAGE}"
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') }}
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+      - name: Determine shm-size
+        run: |
+          shm_size="1g"
+          case "${BUILD_ENVIRONMENT}" in
+            *cuda*)
+              shm_size="2g"
+              ;;
+            *rocm*)
+              shm_size="8g"
+              ;;
+          esac
+          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
+      - uses: actions/download-artifact@v2
+        name: Download PyTorch Build Artifacts
+        with:
+          name: ${{ env.BUILD_ENVIRONMENT }}
+      - name: Unzip artifacts
+        run: |
+          unzip -q artifacts.zip
+      - name: Output disk space left
+        run: |
+          sudo df -H
+      - name: Test PyTorch
+        run: |
+          SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
+          MEMORY_LIMIT_MAX_JOBS=8 # our "linux.2xlarge" runner has 16 vCPUs; if we use all of them we'll OOM
+          export MAX_JOBS=$(( SCCACHE_MAX_JOBS > MEMORY_LIMIT_MAX_JOBS ? MEMORY_LIMIT_MAX_JOBS : SCCACHE_MAX_JOBS ))
+          # GPU_FLAG is deliberately left unquoted so an empty value expands to nothing
+          # shellcheck disable=SC2086
+          docker run \
+            ${GPU_FLAG:-} \
+            -e BUILD_ENVIRONMENT \
+            -e IN_CI \
+            -e MAX_JOBS \
+            --security-opt seccomp=unconfined \
+            --cap-add=SYS_PTRACE \
+            --shm-size="${SHM_SIZE}" \
+            --tty \
+            --user jenkins \
+            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
+            -w /var/lib/jenkins/workspace \
+            "${DOCKER_IMAGE}" \
+            sh -c 'sudo chown -R jenkins . && pip install dist/*.whl && .jenkins/pytorch/test.sh'
+      - name: Clean up docker images
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.gitignore b/.gitignore
index 3210a8ce062..01cd062c0f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -288,3 +288,12 @@ TAGS
 
 # bazel symlinks
 bazel-*
+
+# generated shellcheck directories
+.shellcheck_generated*/
+
+# zip archives
+*.zip
+
+# core dump files
+core.*
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 3e6f6178d3d..81ee4516be9 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -59,13 +59,20 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
   export BUILD_SPLIT_CUDA=ON
 fi
 
+if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
+  export BUILD_CAFFE2=OFF
+fi
+
+if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
+  export ATEN_THREADING=TBB
+  export USE_TBB=1
+elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
+  export ATEN_THREADING=NATIVE
+fi
+
 # TODO: Don't run this...
 pip_install -r requirements.txt || true
 
-# Enable LLVM dependency for TensorExpr testing
-export USE_LLVM=/opt/llvm
-export LLVM_DIR=/opt/llvm/lib/cmake/llvm
-
 # TODO: Don't install this here
 if ! which conda; then
   # In ROCm CIs, we are doing cross compilation on build machines with
@@ -229,40 +236,6 @@ else
     cp build/.ninja_log dist
   fi
 
-    # Build custom operator tests.
-    CUSTOM_OP_BUILD="$PWD/../custom-op-build"
-    CUSTOM_OP_TEST="$PWD/test/custom_operator"
-    python --version
-    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-    mkdir "$CUSTOM_OP_BUILD"
-    pushd "$CUSTOM_OP_BUILD"
-    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
-    make VERBOSE=1
-    popd
-    assert_git_not_dirty
-
-    # Build jit hook tests
-    JIT_HOOK_BUILD="$PWD/../jit-hook-build"
-    JIT_HOOK_TEST="$PWD/test/jit_hooks"
-    python --version
-    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-    mkdir "$JIT_HOOK_BUILD"
-    pushd "$JIT_HOOK_BUILD"
-    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
-    make VERBOSE=1
-    popd
-    assert_git_not_dirty
-
-    # Build custom backend tests.
-    CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
-    CUSTOM_BACKEND_TEST="$PWD/test/custom_backend"
-    python --version
-    mkdir "$CUSTOM_BACKEND_BUILD"
-    pushd "$CUSTOM_BACKEND_BUILD"
-    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
-    make VERBOSE=1
-    popd
-    assert_git_not_dirty
 else
   # Test standalone c10 build
   if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda10.1-cudnn7-py3* ]]; then
diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh
index 57cb554b819..1755efee03f 100644
--- a/.jenkins/pytorch/common.sh
+++ b/.jenkins/pytorch/common.sh
@@ -72,7 +72,16 @@ if [[ "$BUILD_ENVIRONMENT" != *pytorch-win-* ]]; then
   # Save sccache logs to file
   sccache --stop-server || true
   rm ~/sccache_error.log || true
-  if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
+  if [[ -n "${SKIP_SCCACHE_INITIALIZATION:-}" ]]; then
+    # sccache --start-server seems to hang forever on self-hosted runners for GHA
+    # so let's just go ahead and skip the --start-server altogether since it seems
+    # as though sccache still gets used even when the sccache server isn't started
+    # explicitly
+    echo "Skipping sccache server initialization, setting environment variables"
+    export SCCACHE_IDLE_TIMEOUT=1200
+    export SCCACHE_ERROR_LOG=~/sccache_error.log
+    export RUST_LOG=sccache::server=error
+  elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
     SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
   else
     # increasing SCCACHE_IDLE_TIMEOUT so that extension_backend_test.cpp can build after this PR:
@@ -147,3 +156,7 @@ fi
 retry () {
   "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
 }
+
+# Enable LLVM dependency for TensorExpr testing
+export USE_LLVM=/opt/llvm
+export LLVM_DIR=/opt/llvm/lib/cmake/llvm
diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh
index 45051e69719..d1ae0252584 100755
--- a/.jenkins/pytorch/macos-test.sh
+++ b/.jenkins/pytorch/macos-test.sh
@@ -51,7 +51,11 @@ test_python_all() {
   export GLOO_SOCKET_IFNAME=lo0
   echo "Ninja version: $(ninja --version)"
 
-  if [ -n "$CIRCLE_PULL_REQUEST" ]; then
+  # Try to pull value from CIRCLE_PULL_REQUEST first, then GITHUB_HEAD_REF second
+  # CIRCLE_PULL_REQUEST comes from CircleCI
+  # GITHUB_HEAD_REF comes from GitHub Actions
+  IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
+  if [ -n "$IN_PULL_REQUEST" ]; then
     DETERMINE_FROM=$(mktemp)
     file_diff_from_base "$DETERMINE_FROM"
   fi
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index f24f9c90d4c..583d7654c20 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -115,7 +115,11 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX2-* ]]; then
   export ATEN_CPU_CAPABILITY=avx
 fi
 
-if [ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
+# Try to pull value from CIRCLE_PULL_REQUEST first, then GITHUB_HEAD_REF second
+# CIRCLE_PULL_REQUEST comes from CircleCI
+# GITHUB_HEAD_REF comes from GitHub Actions
+IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
+if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
   DETERMINE_FROM=$(mktemp)
   file_diff_from_base "$DETERMINE_FROM"
 fi
@@ -257,6 +261,19 @@ test_rpc() {
 
 test_custom_backend() {
   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then
+    echo "Building custom backend tests"
+    # Build custom backend tests.
+    CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
+    CUSTOM_BACKEND_TEST="$PWD/test/custom_backend"
+    python --version
+    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
+    mkdir "$CUSTOM_BACKEND_BUILD"
+    pushd "$CUSTOM_BACKEND_BUILD"
+    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
+    make VERBOSE=1
+    popd
+    assert_git_not_dirty
+
     echo "Testing custom backends"
     CUSTOM_BACKEND_BUILD="$PWD/../custom-backend-build"
     pushd test/custom_backend
@@ -274,6 +291,19 @@ test_custom_backend() {
 
 test_custom_script_ops() {
   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then
+    # Build custom operator tests.
+    echo "Building custom script operator tests"
+    CUSTOM_OP_BUILD="$PWD/../custom-op-build"
+    CUSTOM_OP_TEST="$PWD/test/custom_operator"
+    python --version
+    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
+    mkdir "$CUSTOM_OP_BUILD"
+    pushd "$CUSTOM_OP_BUILD"
+    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
+    make VERBOSE=1
+    popd
+    assert_git_not_dirty
+
     echo "Testing custom script operators"
     CUSTOM_OP_BUILD="$PWD/../custom-op-build"
     pushd test/custom_operator
@@ -290,6 +320,19 @@ test_custom_script_ops() {
 
 test_jit_hooks() {
   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then
+    echo "Building jit hooks in cpp"
+    # Build jit hook tests
+    JIT_HOOK_BUILD="$PWD/../jit-hook-build"
+    JIT_HOOK_TEST="$PWD/test/jit_hooks"
+    python --version
+    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
+    mkdir "$JIT_HOOK_BUILD"
+    pushd "$JIT_HOOK_BUILD"
+    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)"
+    make VERBOSE=1
+    popd
+    assert_git_not_dirty
+
     echo "Testing jit hooks in cpp"
     HOOK_BUILD="$PWD/../jit-hook-build"
     pushd test/jit_hooks
diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh
index 9b2b05403f8..6152021099c 100755
--- a/.jenkins/pytorch/win-test.sh
+++ b/.jenkins/pytorch/win-test.sh
@@ -42,12 +42,16 @@ fi
 
 export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
 
-if [ -n "$CIRCLE_PULL_REQUEST" ]; then
+# Try to pull value from CIRCLE_PULL_REQUEST first, then GITHUB_HEAD_REF second
+# CIRCLE_PULL_REQUEST comes from CircleCI
+# GITHUB_HEAD_REF comes from GitHub Actions
+IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
+if [ -n "$IN_PULL_REQUEST" ]; then
   DETERMINE_FROM="${TMP_DIR}/determine_from"
   file_diff_from_base "$DETERMINE_FROM"
 fi
 
-if [[ "${CIRCLE_JOB}" == *11* ]]; then
+if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then
   export BUILD_SPLIT_CUDA=ON
 fi
 
diff --git a/Makefile b/Makefile
index 13755ce544c..3fe69bf14cf 100644
--- a/Makefile
+++ b/Makefile
@@ -14,8 +14,19 @@ ios:
 
 clean: # This will remove ALL build folders.
 	@rm -r build*/
+	@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
 
 linecount:
 	@cloc --read-lang-def=caffe.cloc caffe2 || \
 		echo "Cloc is not available on the machine. You can install cloc with " && \
 		echo "    sudo apt-get install cloc"
+
+SHELLCHECK_GHA_GENERATED_FOLDER=.shellcheck_generated_gha
+shellcheck-gha:
+	@$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER)
+	tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER)
+	tools/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER)
+
+generate-gha-workflows:
+	./.github/scripts/generate_linux_ci_workflows.py
+	$(MAKE) shellcheck-gha
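A note on the templating trick the generator depends on: GitHub Actions reserves "${{ ... }}" for its own expression syntax, so the Jinja2 environment is constructed with variable_start_string="!{{". Only "!{{ ... }}" placeholders are expanded at generation time, while the workflow's "${{ ... }}" expressions pass through verbatim. A minimal, self-contained sketch of that behavior (the two-line template text here is hypothetical; only the jinja2 package is assumed):

import jinja2

# "!{{" starts a Jinja2 variable; GitHub's "${{ ... }}" survives untouched
# because the default "{{" delimiter is no longer special.
env = jinja2.Environment(variable_start_string="!{{")
template = env.from_string(
    "BUILD_ENVIRONMENT: !{{ build_environment }}\n"
    "DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}\n"
)
print(template.render(build_environment="pytorch-linux-xenial-py3.6-gcc5.4"))
# BUILD_ENVIRONMENT: pytorch-linux-xenial-py3.6-gcc5.4
# DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}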
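The templates job added to lint.yml enforces that the checked-in workflow files are exactly what the generator produces: it reruns the generator, then report_git_status.sh fails the job if git reports any dirty files. The same idempotence check expressed in Python, as a rough sketch:

import subprocess
import sys

# Regenerating the workflows must be a no-op on a clean checkout.
changes = subprocess.check_output(["git", "status", "--porcelain"], text=True)
print(changes, end="")
subprocess.run(["git", "diff"], check=False)
sys.exit(1 if changes.strip() else 0)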
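The calculate-docker-image job derives the image tag from "git rev-parse HEAD:.circleci/docker", which resolves to the git tree-object hash of the Docker build context; the tag therefore changes exactly when something under .circleci/docker changes, and the prebuilt ECR image is reused otherwise. A rough Python equivalent of that step (the function name is ours; it assumes it runs inside a git checkout):

import subprocess

DOCKER_IMAGE_BASE = (
    "308535385114.dkr.ecr.us-east-1.amazonaws.com"
    "/pytorch/pytorch-linux-xenial-py3.6-gcc5.4"
)

def calculate_docker_image() -> str:
    # Tree hash of .circleci/docker at HEAD; stable until that context changes.
    docker_tag = subprocess.check_output(
        ["git", "rev-parse", "HEAD:.circleci/docker"], text=True
    ).strip()
    return f"{DOCKER_IMAGE_BASE}:{docker_tag}"

print(calculate_docker_image())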
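The MAX_JOBS arithmetic repeated in the build and test steps clamps parallelism twice over: one core is kept free for sccache, and jobs are capped at eight because the 16-vCPU linux.2xlarge runners run out of memory above that. The shell ternary is simply a min(), sketched here:

import os

SCCACHE_MAX_JOBS = (os.cpu_count() or 2) - 1  # leave one core for sccache itself
MEMORY_LIMIT_MAX_JOBS = 8  # more than 8 parallel compile jobs OOMs a 16-vCPU runner
MAX_JOBS = min(SCCACHE_MAX_JOBS, MEMORY_LIMIT_MAX_JOBS)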
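Finally, the IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}} pattern added to the test scripts is a first-set-and-non-empty fallback across the two CI systems (${VAR:-default} substitutes the default when VAR is unset or empty). In Python terms:

import os

# CIRCLE_PULL_REQUEST (CircleCI) takes precedence over GITHUB_HEAD_REF
# (GitHub Actions); an empty result means this is not a pull-request build.
in_pull_request = (
    os.environ.get("CIRCLE_PULL_REQUEST")
    or os.environ.get("GITHUB_HEAD_REF")
    or ""
)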