pytorch/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml

# @generated DO NOT EDIT MANUALLY
# Template is at:    .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: pytorch-xla-linux-bionic-py3.7-clang8

on:
  push:
    tags:
      - 'ciflow/all/*'
      - 'ciflow/cpu/*'
      - 'ciflow/linux/*'
      - 'ciflow/trunk/*'
      - 'ciflow/xla/*'
    branches:
      - master
      - release/*
  workflow_dispatch:

env:
  BUILD_ENVIRONMENT: pytorch-xla-linux-bionic-py3.7-clang8
  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
  TORCH_CUDA_ARCH_LIST: 5.2
  IN_CI: 1
  IS_GHA: 1
  # This is used for the phase of adding wheel tests only, will be removed once completed
  IN_WHEEL_TEST: 1
  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  AWS_DEFAULT_REGION: us-east-1
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  PYTORCH_RETRY_TEST_CASES: 1
  # This is used for XLA tests only
  XLA_CUDA: 0
  XLA_IMAGE_TAG: v0.2
concurrency:
  group: pytorch-xla-linux-bionic-py3.7-clang8-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:

  build:
    runs-on: linux.2xlarge
    timeout-minutes: 240
    env:
      JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-build
    outputs:
      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
    steps:
      - name: print labels
        run: echo "${PR_LABELS}"
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
          retry () {
              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - name: Chown workspace
        run: |
          retry () {
              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
      - name: Calculate docker image tag
        id: calculate-tag
        run: |
          echo "XLA workflow uses pre-built test image at ${XLA_IMAGE_TAG}"
          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
          echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" >> "${GITHUB_ENV}"
          echo "::set-output name=docker_tag::${DOCKER_TAG}"
          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}"
      - name: Pull Docker image
        run: |
          retry () {
              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Build
        env:
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        run: |
          # detached container should get cleaned up by teardown_ec2_linux
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e JOB_BASE_NAME \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e IS_GHA \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
            -e GITHUB_RUN_ID \
            -e SCCACHE_BUCKET \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
      - name: Display and upload binary build size statistics (Click Me)
        # temporary hack: set CIRCLE_* vars, until we update
        # tools/stats/print_test_stats.py to natively support GitHub Actions
        env:
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: '${{ github.run_id }}'
        run: |
          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
          export COMMIT_TIME
          pip3 install requests==2.26 boto3==1.16.34
          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Archive artifacts into zip
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
      - uses: seemethere/upload-artifact-s3@v3
        name: Store PyTorch Build Artifacts on S3
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
          retention-days: 14
          if-no-files-found: error
          path:
            artifacts.zip
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Clean up docker images
        if: always()
        run: |
          # Prune all of the docker images
          docker system prune -af

  test_xla_1_1:
    name: test (xla, 1, 1, linux.2xlarge)
    needs: build
    runs-on: linux.2xlarge
    timeout-minutes: 240
    env:
      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
      JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test
      TEST_CONFIG: xla
      SHARD_NUMBER: 1
      NUM_TEST_SHARDS: 1
      PR_BODY: ${{ github.event.pull_request.body }}
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
          retry () {
              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
      - name: Chown workspace
        run: |
          retry () {
              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Clean PyTorch checkout
        run: |
          # Remove any artifacts from the previous checkouts
          git clean -fxd
      - name: Pull Docker image
        run: |
          retry () {
              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          unzip -o artifacts.zip
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Test
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        # Time out the test phase after 240 minutes
        timeout-minutes: 240
        run: |
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi
          PROXY_ENV=
          # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now
          #       We should investigate whether or not there's a list of hostnames we can add to no_proxy to
          #       make it so that we shouldn't have to fully disable squid for XLA tests
          if [[ $TEST_CONFIG != 'xla' ]]; then
            # shellcheck disable=SC2089
            PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock"
          fi
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e GITHUB_ACTIONS \
            -e IN_CI \
            -e IS_GHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e JOB_BASE_NAME \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PR_BODY \
            -e PYTORCH_RETRY_TEST_CASES \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            ${PROXY_ENV} \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Install render_test_results dependencies
        if: always()
        shell: bash
        run: |
          python3 -m pip install junitparser==2.1.1 rich==10.9.0
      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
        if: always()
        shell: bash
        # Encoding is weird on windows, just try to default to utf-8 if possible
        env:
          PYTHONIOENCODING: "utf-8"
        run: |
          python3 tools/render_junit.py test/
      - name: Zip JSONs for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-xla-1-1-linux.2xlarge'
        run: |
          # Remove any previous test jsons if they exist
          rm -f test-jsons-*.zip
          zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Downloaded JSONs on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: warn
          path:
            test-jsons-*.zip
      - name: Zip test reports for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-xla-1-1-linux.2xlarge'
        run: |
          # Remove any previous test reports if they exist
          rm -f test-reports-*.zip
          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Reports on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: error
          path:
            test-reports-*.zip
      - name: Upload test statistics
        if: always()
        env:
          AWS_DEFAULT_REGION: us-east-1
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          TAG: ${{ steps.parse-ref.outputs.tag }}
          WORKFLOW_ID: '${{ github.run_id }}'
        shell: bash
        run: |
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.19.12
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af