[Bazel] Add CUDA build to CI (#66241)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/35316
On master, the Bazel CUDA build is disabled because there is no proper `cu_library` rule. This PR:
- Adds `rules_cuda` to the WORKSPACE and forwards `cu_library` to `rules_cuda` (a minimal usage sketch follows this list).
- Uses simple local CUDA and cuDNN repositories (adapted from TRTorch) for CUDA 11.3.
- Fixes the currently broken CUDA build.
- Enables the CUDA build in CI, not only for the `:torch` target but for all the test binaries, to catch undefined symbols.
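
For orientation, here is a minimal sketch of how a target can use the forwarded `cu_library` rule together with `if_cuda`. The target and source names below are hypothetical and not part of this PR; the load paths and `@cuda//:cuda_headers` do appear in the diff.

    load("//tools/rules:cu.bzl", "cu_library")
    load("//tools/config:defs.bzl", "if_cuda")

    # Hypothetical CUDA kernel library, compiled with nvcc via rules_cuda.
    cu_library(
        name = "my_kernels_cuda",
        srcs = ["my_kernels.cu"],          # placeholder source
        deps = ["@cuda//:cuda_headers"],   # headers from the local CUDA repository
    )

    # CPU-side library that links the CUDA kernels only when building with --config=gpu.
    cc_library(
        name = "my_ops",
        srcs = ["my_ops.cpp"],
        deps = if_cuda([":my_kernels_cuda"]),
    )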

Pull Request resolved: https://github.com/pytorch/pytorch/pull/66241

Reviewed By: ejguan

Differential Revision: D31544091

Pulled By: malfet

fbshipit-source-id: fd3c34d0e8f80fee06f015694a4c13a8e9e12206
Authored by Thuyen Ngo on 2021-12-17 13:41:24 -08:00; committed by Facebook GitHub Bot
parent e0f4e28c69
commit e35bf56461
17 changed files with 767 additions and 828 deletions


@ -1,6 +1,7 @@
build --copt=--std=c++14
build --copt=-I.
build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
build --experimental_ui_max_stdouterr_bytes=2048576
# Configuration to disable tty features for environments like CI
build:no-tty --curses no
@ -11,3 +12,8 @@ build:no-tty --show_progress_rate_limit 10
build:gpu --define=cuda=true
# define a separate build folder for faster switching between configs
build:gpu --platform_suffix=-gpu
# rules_cuda configuration
build:gpu --@rules_cuda//cuda:enable_cuda
build:gpu --@rules_cuda//cuda:cuda_targets=sm_52
build:gpu --@rules_cuda//cuda:compiler=nvcc
build:gpu --repo_env=CUDA_PATH=/usr/local/cuda
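
The `gpu` config now drives two switches: the pre-existing `--define=cuda=true` is the kind of flag that the `if_cuda` helper used throughout the BUILD files can key on, while the `@rules_cuda//cuda:*` flags plus `CUDA_PATH` tell `rules_cuda` to compile `cu_library` targets with nvcc for `sm_52` against the locally installed toolkit. For the first mechanism only, a define-based switch typically looks like the sketch below; the `config_setting` name and its location are assumptions for illustration, not the actual contents of `tools/config`.

    # In a BUILD file: true when --define=cuda=true is passed (hypothetical name).
    config_setting(
        name = "cuda_enabled",
        define_values = {"cuda": "true"},
    )

    # In a .bzl file: pick the CUDA-only items when the setting above is active.
    def if_cuda(if_true, if_false = []):
        return select({
            ":cuda_enabled": if_true,   # hypothetical label
            "//conditions:default": if_false,
        })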


@ -20,13 +20,13 @@
"linux-docs-push",
"linux-vulkan-bionic-py3.6-clang9",
"linux-xenial-cuda11.3-py3.6-gcc7",
"linux-xenial-cuda11.3-py3.6-gcc7-bazel-test",
"linux-xenial-py3-clang5-mobile-build",
"linux-xenial-py3-clang5-mobile-custom-build-static",
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"macos-10-15-py3-arm64",
"macos-10-15-py3-lite-interpreter-x86-64",
"macos-11-py3-x86-64",
@ -48,7 +48,7 @@
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
],
"ciflow/bazel": [
"linux-xenial-py3.6-gcc7-bazel-test"
"linux-xenial-cuda11.3-py3.6-gcc7-bazel-test"
],
"ciflow/cpu": [
"caffe2-linux-xenial-py3.6-gcc5.4",
@ -56,11 +56,11 @@
"linux-docs",
"linux-docs-push",
"linux-vulkan-bionic-py3.6-clang9",
"linux-xenial-cuda11.3-py3.6-gcc7-bazel-test",
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"parallelnative-linux-xenial-py3.6-gcc5.4",
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single",
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit",
@ -85,13 +85,13 @@
"linux-docs",
"linux-vulkan-bionic-py3.6-clang9",
"linux-xenial-cuda11.3-py3.6-gcc7",
"linux-xenial-cuda11.3-py3.6-gcc7-bazel-test",
"linux-xenial-py3-clang5-mobile-build",
"linux-xenial-py3-clang5-mobile-custom-build-static",
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single",
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit",
"win-vs2019-cpu-py3",
@ -126,13 +126,13 @@
"linux-docs-push",
"linux-vulkan-bionic-py3.6-clang9",
"linux-xenial-cuda11.3-py3.6-gcc7",
"linux-xenial-cuda11.3-py3.6-gcc7-bazel-test",
"linux-xenial-py3-clang5-mobile-build",
"linux-xenial-py3-clang5-mobile-custom-build-static",
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"parallelnative-linux-xenial-py3.6-gcc5.4",
"periodic-libtorch-linux-bionic-cuda11.5-py3.6-gcc7",
"periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
@ -203,13 +203,13 @@
"linux-docs",
"linux-vulkan-bionic-py3.6-clang9",
"linux-xenial-cuda11.3-py3.6-gcc7",
"linux-xenial-cuda11.3-py3.6-gcc7-bazel-test",
"linux-xenial-py3-clang5-mobile-build",
"linux-xenial-py3-clang5-mobile-custom-build-static",
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"macos-10-15-py3-arm64",
"macos-10-15-py3-lite-interpreter-x86-64",
"macos-11-py3-x86-64",


@ -609,8 +609,8 @@ ANDROID_WORKFLOWS = [
BAZEL_WORKFLOWS = [
CIWorkflow(
arch="linux",
build_environment="linux-xenial-py3.6-gcc7-bazel-test",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
build_environment="linux-xenial-cuda11.3-py3.6-gcc7-bazel-test",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7",
test_runner_type=LINUX_CPU_TEST_RUNNER,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BAZEL, LABEL_CIFLOW_CPU, LABEL_CIFLOW_LINUX},


@ -0,0 +1,331 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/bazel_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test
on:
pull_request:
types: [opened, synchronize, reopened, unassigned]
push:
branches:
- master
- release/*
- fbsync
workflow_dispatch:
env:
BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
IS_GHA: 1
# This is used for the phase of adding wheel tests only, will be removed once completed
IN_WHEEL_TEST: 1
# Used for custom_operator, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
PYTORCH_RETRY_TEST_CASES: 1
concurrency:
group: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
ciflow_should_run:
runs-on: linux.large
timeout-minutes: 240
env:
IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assignee.login == 'pytorchbot') }}
LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') }}
LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
if: ${{ (github.repository == 'pytorch/pytorch') && (
(github.event_name == 'push') ||
(github.event_name == 'schedule') ||
(contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/bazel') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) || contains(github.event.pull_request.labels.*.name, 'ciflow/trunk') ||
((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/')))
}}
steps:
- name: noop
run: echo running ciflow_should_run
- name: print labels
run: echo "${LABELS}"
# building and testing in a single job since bazel runs only small subset of tests
build-and-test:
runs-on: linux.2xlarge
needs: [ciflow_should_run]
env:
JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-build-and-test
NUM_TEST_SHARDS: 1
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
- name: Log in to ECR
env:
AWS_RETRY_MODE: standard
AWS_MAX_ATTEMPTS: 5
run: |
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
--password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
- name: Chown workspace
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"
# Ensure the working directory gets chowned back to the current user
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
rm -f ~/.ssh/authorized_keys
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
# deep clone, to allow use of git merge-base
fetch-depth: 0
submodules: recursive
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "::set-output name=docker_tag::${DOCKER_TAG}"
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
- name: Check if image should be built
id: check
env:
BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
run: |
set -x
# Check if image already exists, if it does then skip building it
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
exit 0
fi
if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
# if we're on the base branch then use the parent commit
MERGE_BASE=$(git rev-parse HEAD~)
else
# otherwise we're on a PR, so use the most recent base commit
MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
fi
# Covers the case where a previous tag doesn't exist for the tree
# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
exit 1
fi
PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
# If no image exists but the hash is the same as the previous hash then we should error out here
if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
echo " contact the PyTorch team to restore the original images"
exit 1
fi
echo ::set-output name=rebuild::yes
- name: Build and push docker image
if: ${{ steps.check.outputs.rebuild }}
env:
DOCKER_SKIP_S3_UPLOAD: 1
working-directory: .circleci/docker
run: |
export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
./build_docker.sh
- name: Pull Docker image
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${DOCKER_IMAGE}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- name: Output disk space left
run: |
sudo df -H
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Build
run: |
# detached container should get cleaned up by teardown_ec2_linux
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e JOB_BASE_NAME \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e PR_LABELS \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh'
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Display and upload binary build size statistics (Click Me)
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
AWS_DEFAULT_REGION: us-east-1
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
TAG: ${{ steps.parse-ref.outputs.tag }}
WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
run: |
COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
export COMMIT_TIME
pip3 install requests==2.26 boto3==1.16.34
python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
- name: Test
# Time out the test phase after 3.5 hours
timeout-minutes: 210
run: |
# detached container should get cleaned up by teardown_ec2_linux
export SHARD_NUMBER=0
# TODO: Stop building test binaries as part of the build phase
# Make sure we copy test results from bazel-testlogs symlink to
# a regular directory ./test/test-reports
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e GITHUB_ACTIONS \
-e IN_CI \
-e SHARD_NUMBER \
-e NUM_TEST_SHARDS \
-e JOB_BASE_NAME \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e PR_LABELS \
-e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--detach \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports'
- name: Chown workspace
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Zip test reports for upload
if: always()
env:
FILE_SUFFIX: 'bazel-${{ github.job }}'
run: |
# Remove any previous test reports if they exist
rm -f test-reports-*.zip
zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
- uses: seemethere/upload-artifact-s3@v3
name: Store Test Reports on S3
if: always()
with:
retention-days: 14
if-no-files-found: error
path:
test-reports-*.zip
- name: Zip JSONs for upload
if: always()
env:
FILE_SUFFIX: 'bazel-${{ github.job }}'
run: |
# Remove any previous test jsons if they exist
rm -f test-jsons-*.zip
zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
- uses: seemethere/upload-artifact-s3@v3
name: Store Test Downloaded JSONs on S3
if: always()
with:
retention-days: 14
if-no-files-found: warn
path:
test-jsons-*.zip
- name: Display and upload test statistics (Click Me)
if: always()
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
AWS_DEFAULT_REGION: us-east-1
BRANCH: ${{ steps.parse-ref.outputs.branch }}
JOB_BASE_NAME: linux-xenial-cuda11.3-py3.6-gcc7-bazel-test-test
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
TAG: ${{ steps.parse-ref.outputs.tag }}
WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
shell: bash
run: |
python3 -m pip install -r requirements.txt
python3 -m pip install boto3==1.19.12
python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Chown workspace
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
if: always()
run: |
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
# Prune all of the docker images
docker system prune -af


@ -207,11 +207,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
get_bazel
# first build the whole torch for CPU-only
# first build torch for CPU-only
tools/bazel build --config=no-tty :torch
# then build selected set of targets with GPU-support.
# TODO: eventually this should converge to building the whole :torch with GPU-support
tools/bazel build --config=no-tty --config=gpu //c10
# then build everything with CUDA
tools/bazel build --config=no-tty --config=gpu :all
else
# check that setup.py would fail with bad arguments
echo "The next three invocations are expected to fail with invalid command error messages."


@ -3,7 +3,7 @@ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
load("@rules_proto//proto:defs.bzl", "proto_library")
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test")
load("//third_party:substitution.bzl", "header_template_rule")
load("//:tools/build_variables.bzl", "torch_cpp_srcs", "libtorch_python_core_sources", "libtorch_core_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "jit_core_sources")
load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs")
load("//tools/rules:cu.bzl", "cu_library")
load("//tools/config:defs.bzl", "if_cuda")
load("//:aten.bzl", "intern_build_aten_ops", "generate_aten")
@ -15,6 +15,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DHAVE_GCC_GET_CPUID",
"-DTH_BLAS_MKL",
"-DUSE_GCC_GET_CPUID",
"-DTH_HAVE_THREAD",
"-DUSE_FBGEMM",
@ -37,11 +38,11 @@ py_binary(
],
)
aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + glob(["aten/src/ATen/templates/**"])
generate_aten(
name = "generated_cpp",
srcs = [
"aten/src/ATen/native/native_functions.yaml",
] + glob(["aten/src/ATen/templates/**"]),
srcs = aten_generation_srcs,
outs = [
"aten/src/ATen/Declarations.yaml",
"aten/src/ATen/RegisterBackendSelect.cpp",
@ -62,8 +63,6 @@ generate_aten(
"aten/src/ATen/RegisterSchema.cpp",
"aten/src/ATen/CPUFunctions.h",
"aten/src/ATen/CPUFunctions_inl.h",
"aten/src/ATen/CUDAFunctions.h",
"aten/src/ATen/CUDAFunctions_inl.h",
"aten/src/ATen/CompositeExplicitAutogradFunctions.h",
"aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h",
"aten/src/ATen/CompositeImplicitAutogradFunctions.h",
@ -82,6 +81,8 @@ generate_aten(
"aten/src/ATen/MetaFunctions.h",
"aten/src/ATen/MetaFunctions_inl.h",
"aten/src/ATen/MethodOperators.h",
"aten/src/ATen/NativeMetaFunctions.h",
"aten/src/ATen/RegistrationDeclarations.h",
"aten/src/ATen/core/TensorBody.h",
"aten/src/ATen/core/TensorMethods.cpp",
"aten/src/ATen/core/ATenOpList.cpp",
@ -89,6 +90,23 @@ generate_aten(
generator=":gen",
)
# this hack is due to https://github.com/bazelbuild/bazel/issues/281
# since `outs` cannot be configured with if_cuda, we rerun the same command and declare cuda related files separately here.
genrule(
name = "generated_cuda_cpp",
srcs = aten_generation_srcs,
outs = [
"aten/src/ATen/CUDAFunctions.h",
"aten/src/ATen/CUDAFunctions_inl.h",
"aten/src/ATen/RegisterCUDA.cpp",
"aten/src/ATen/RegisterQuantizedCUDA.cpp",
"aten/src/ATen/RegisterSparseCUDA.cpp",
"aten/src/ATen/RegisterSparseCsrCUDA.cpp",
],
cmd = "$(location :gen) --source-path `dirname $(location aten/src/ATen/native/native_functions.yaml)`/.. --install_dir `dirname $(location aten/src/ATen/RegisterCUDA.cpp)`",
tools = [":gen"],
)
py_library(
name = "tools_codegen",
srcs = glob(["tools/codegen/**/*.py"]),
@ -230,7 +248,7 @@ filegroup(
filegroup(
name = "aten_native_mkl_cpp",
srcs = glob(["aten/src/ATen/native/mkl/*.cpp"]),
srcs = glob(["aten/src/ATen/native/mkl/*.cpp", "aten/src/ATen/mkl/*.cpp"]),
)
filegroup(
@ -266,135 +284,40 @@ filegroup(
)
filegroup(
name = "aten_cuda_srcs",
srcs = [
"aten/src/ATen/cuda/CUDABlas.cpp",
"aten/src/ATen/cuda/CUDASolver.cpp",
"aten/src/ATen/cuda/CUDAContext.cpp",
"aten/src/ATen/cuda/CUDAGeneratorImpl.cpp",
"aten/src/ATen/cuda/CUDAGraph.cpp",
"aten/src/ATen/cuda/CuSparseHandlePool.cpp",
"aten/src/ATen/cuda/CublasHandlePool.cpp",
"aten/src/ATen/cuda/CusolverDnHandlePool.cpp",
"aten/src/ATen/cuda/PinnedMemoryAllocator.cpp",
"aten/src/ATen/cuda/detail/CUDAHooks.cpp",
"aten/src/ATen/cudnn/AutocastRNN.cpp",
"aten/src/ATen/cudnn/Descriptors.cpp",
"aten/src/ATen/cudnn/Handle.cpp",
"aten/src/ATen/cudnn/Types.cpp",
"aten/src/ATen/native/cuda/CUDAUnaryOps.cpp",
"aten/src/ATen/native/cuda/TensorShapeCUDA.cpp",
"aten/src/ATen/native/cudnn/AffineGridGenerator.cpp",
"aten/src/ATen/native/cudnn/BatchNorm.cpp",
"aten/src/ATen/native/cudnn/Conv.cpp",
"aten/src/ATen/native/cudnn/GridSampler.cpp",
"aten/src/ATen/native/cudnn/LossCTC.cpp",
"aten/src/ATen/native/cudnn/RNN.cpp",
"aten/src/ATen/native/miopen/BatchNorm_miopen.cpp",
"aten/src/ATen/native/miopen/Conv_miopen.cpp",
"aten/src/ATen/native/miopen/RNN_miopen.cpp",
"aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp",
"aten/src/ATen/native/sparse/cuda/SparseBlas.cpp",
"aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp",
],
name = "aten_cuda_cpp_srcs",
srcs = glob(
[
"aten/src/ATen/cuda/*.cpp",
"aten/src/ATen/cuda/detail/*.cpp",
"aten/src/ATen/cudnn/*.cpp",
"aten/src/ATen/native/cuda/*.cpp",
"aten/src/ATen/native/cudnn/*.cpp",
"aten/src/ATen/native/miopen/*.cpp",
"aten/src/ATen/native/sparse/cuda/*.cpp",
"aten/src/THC/*.cpp",
],
),
)
filegroup(
name = "aten_srcs_cu",
srcs = [
"aten/src/ATen/cuda/cub.cu.cc",
"aten/src/ATen/cuda/detail/IndexUtils.cu.cc",
"aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc",
"aten/src/ATen/native/cuda/Activation.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu.cc",
"aten/src/ATen/native/cuda/AveragePool2d.cu.cc",
"aten/src/ATen/native/cuda/AveragePool3d.cu.cc",
"aten/src/ATen/native/cuda/BatchLinearAlgebra.cu.cc",
"aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cu.cc",
"aten/src/ATen/native/cuda/BinaryArithmeticKernel.cu.cc",
"aten/src/ATen/native/cuda/BinaryCompareKernel.cu.cc",
"aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu.cc",
"aten/src/ATen/native/cuda/CUDAScalar.cu.cc",
"aten/src/ATen/native/cuda/Col2Im.cu.cc",
"aten/src/ATen/native/cuda/Copy.cu.cc",
"aten/src/ATen/native/cuda/CrossKernel.cu.cc",
"aten/src/ATen/native/cuda/DilatedMaxPool2d.cu.cc",
"aten/src/ATen/native/cuda/DilatedMaxPool3d.cu.cc",
"aten/src/ATen/native/cuda/DistanceKernel.cu.cc",
"aten/src/ATen/native/cuda/Distributions.cu.cc",
"aten/src/ATen/native/cuda/Dropout.cu.cc",
"aten/src/ATen/native/cuda/Embedding.cu.cc",
"aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu.cc",
"aten/src/ATen/native/cuda/EmbeddingBag.cu.cc",
"aten/src/ATen/native/cuda/FillKernel.cu.cc",
"aten/src/ATen/native/cuda/FractionalMaxPool2d.cu.cc",
"aten/src/ATen/native/cuda/FractionalMaxPool3d.cu.cc",
"aten/src/ATen/native/cuda/GridSampler.cu.cc",
"aten/src/ATen/native/cuda/Im2Col.cu.cc",
"aten/src/ATen/native/cuda/IndexKernel.cu.cc",
"aten/src/ATen/native/cuda/Indexing.cu.cc",
"aten/src/ATen/native/cuda/Lerp.cu.cc",
"aten/src/ATen/native/cuda/LinearAlgebra.cu.cc",
"aten/src/ATen/native/cuda/Loss.cu.cc",
"aten/src/ATen/native/cuda/LossCTC.cu.cc",
"aten/src/ATen/native/cuda/MaxUnpooling.cu.cc",
"aten/src/ATen/native/cuda/MultinomialKernel.cu.cc",
"aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu.cc",
"aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu.cc",
"aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu.cc",
"aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu.cc",
"aten/src/ATen/native/cuda/NLLLoss2d.cu.cc",
"aten/src/ATen/native/cuda/Normalization.cu.cc",
"aten/src/ATen/native/cuda/PointwiseOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/PowKernel.cu.cc",
"aten/src/ATen/native/cuda/RNN.cu.cc",
"aten/src/ATen/native/cuda/RangeFactories.cu.cc",
"aten/src/ATen/native/cuda/Reduce.cu.cc",
"aten/src/ATen/native/cuda/ReduceOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/ReflectionPad.cu.cc",
"aten/src/ATen/native/cuda/Repeat.cu.cc",
"aten/src/ATen/native/cuda/ReplicationPadding.cu.cc",
"aten/src/ATen/native/cuda/Resize.cu.cc",
"aten/src/ATen/native/cuda/SegmentReduce.cu.cc",
"aten/src/ATen/native/cuda/SoftMax.cu.cc",
"aten/src/ATen/native/cuda/SortingKthValue.cu.cc",
"aten/src/ATen/native/cuda/SparseMM.cu.cc",
"aten/src/ATen/native/cuda/SpectralOps.cu.cc",
"aten/src/ATen/native/cuda/SummaryOps.cu.cc",
"aten/src/ATen/native/cuda/TensorCompare.cu.cc",
"aten/src/ATen/native/cuda/TensorFactories.cu.cc",
"aten/src/ATen/native/cuda/TensorTopK.cu.cc",
"aten/src/ATen/native/cuda/TensorTransformations.cu.cc",
"aten/src/ATen/native/cuda/TriangularOps.cu.cc",
"aten/src/ATen/native/cuda/UnaryOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu.cc",
"aten/src/ATen/native/cuda/Unique.cu.cc",
"aten/src/ATen/native/cuda/UpSampleBicubic2d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleBilinear2d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleLinear1d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleNearest1d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleNearest2d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleNearest3d.cu.cc",
"aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu.cc",
"aten/src/ATen/native/cuda/WeightNorm.cu.cc",
"aten/src/ATen/native/cuda/layer_norm_kernel.cu.cc",
"aten/src/ATen/native/quantized/cuda/fake_quantize_core.cu.cc",
"aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu.cc",
"aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu.cc",
"aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu.cc",
],
name = "aten_cu_srcs",
srcs = glob([
"aten/src/ATen/cuda/*.cu",
"aten/src/ATen/cuda/detail/*.cu",
"aten/src/ATen/native/cuda/*.cu",
"aten/src/ATen/native/quantized/cuda/*.cu",
"aten/src/ATen/native/sparse/cuda/*.cu",
]),
)
header_template_rule(
name = "aten_src_ATen_config",
src = "aten/src/ATen/Config.h.in",
out = "aten/src/ATen/Config.h",
include = "aten/src",
substitutions = {
"@AT_MKLDNN_ENABLED@": "1",
"@AT_MKL_ENABLED@": "0",
"@AT_MKL_ENABLED@": "1",
"@AT_FFTW_ENABLED@": "0",
"@AT_POCKETFFT_ENABLED@": "0",
"@AT_NNPACK_ENABLED@": "0",
@ -413,6 +336,7 @@ header_template_rule(
name = "aten_src_ATen_cuda_config",
src = "aten/src/ATen/cuda/CUDAConfig.h.in",
out = "aten/src/ATen/cuda/CUDAConfig.h",
include = "aten/src",
substitutions = {
"@AT_CUDNN_ENABLED@": "1",
"@AT_ROCM_ENABLED@": "0",
@ -429,18 +353,19 @@ cc_library(
] + glob([
"aten/src/**/*.h",
"aten/src/**/*.hpp",
"aten/src/ATen/cuda/**/*.cuh",
"aten/src/ATen/native/**/*.cuh",
"aten/src/TH/**/*.cpp",
"aten/src/THC/*.cuh",
"aten/src/THC/generic/*.cu",
],
exclude = [
"aten/src/ATen/Config.h",
],) + [
":generated_cpp",
) + [
":aten_src_ATen_config",
":generated_cpp",
":generated_cuda_cpp",
],
includes = [
"aten/src",
"aten/src/TH",
],
deps = [
"//c10:headers",
@ -464,6 +389,7 @@ intern_build_aten_ops(
":aten_headers",
"@sleef",
"@fbgemm",
"@mkl",
],
)
@ -530,12 +456,17 @@ cc_binary(
cc_library(
name = "aten_cuda_cpp",
srcs = [":aten_cuda_srcs"],
srcs = [
":aten_cuda_cpp_srcs",
":generated_cuda_cpp",
],
hdrs = [":aten_src_ATen_cuda_config"],
copts = ATEN_COPTS,
visibility = ["//visibility:public"],
deps = [
":aten",
"@cuda",
"@cuda//:cusolver",
"@cuda//:nvrtc",
"@cudnn",
],
@ -552,9 +483,7 @@ torch_cuda_half_options = [
cu_library(
name = "aten_cuda",
srcs = [
":aten_srcs_cu",
],
srcs = [":aten_cu_srcs"],
copts = ATEN_COPTS + torch_cuda_half_options,
visibility = ["//visibility:public"],
deps = [
@ -618,6 +547,7 @@ header_template_rule(
filegroup(
name = "caffe2_contrib_srcs",
srcs = [
"caffe2/contrib/aten/aten_op.cc",
"caffe2/contrib/gloo/allgather_ops.cc",
"caffe2/contrib/gloo/allreduce_ops.cc",
"caffe2/contrib/gloo/barrier_ops.cc",
@ -787,6 +717,7 @@ filegroup(
"caffe2/operators/conv_op_eigen.cc",
"caffe2/operators/conv_op_shared.cc",
"caffe2/operators/conv_transpose_gradient_op.cc",
"caffe2/operators/conv_transpose_op.cc",
"caffe2/operators/conv_transpose_op_mobile.cc",
"caffe2/operators/copy_op.cc",
"caffe2/operators/copy_rows_to_tensor_op.cc",
@ -1182,7 +1113,7 @@ filegroup(
)
filegroup(
name = "caffe2_cuda_srcs",
name = "caffe2_cuda_cpp_srcs",
srcs = [
"caffe2/contrib/aten/aten_op_gpu.cc",
"caffe2/contrib/gloo/allreduce_ops_gpu.cc",
@ -1251,155 +1182,155 @@ filegroup(
filegroup(
name = "caffe2_cu_srcs",
srcs = [
"caffe2/core/context_gpu.cu.cc",
"caffe2/operators/abs_op.cu.cc",
"caffe2/operators/accumulate_op.cu.cc",
"caffe2/operators/accuracy_op.cu.cc",
"caffe2/operators/acos_op.cu.cc",
"caffe2/operators/affine_channel_op.cu.cc",
"caffe2/operators/alias_with_name.cu.cc",
"caffe2/operators/arg_ops.cu.cc",
"caffe2/operators/asin_op.cu.cc",
"caffe2/operators/assert_op.cu.cc",
"caffe2/operators/atan_op.cu.cc",
"caffe2/operators/batch_gather_ops.cu.cc",
"caffe2/operators/batch_matmul_op.cu.cc",
"caffe2/operators/batch_moments_op.cu.cc",
"caffe2/operators/batch_permutation_op.cu.cc",
"caffe2/operators/batch_sparse_to_dense_op.cu.cc",
"caffe2/operators/boolean_mask_ops.cu.cc",
"caffe2/operators/boolean_unmask_ops.cu.cc",
"caffe2/operators/bucketize_op.cu.cc",
"caffe2/operators/cast_op.cu.cc",
"caffe2/operators/cbrt_op.cu.cc",
"caffe2/operators/ceil_op.cu.cc",
"caffe2/operators/channel_backprop_stats_op.cu.cc",
"caffe2/operators/channel_shuffle_op.cu.cc",
"caffe2/operators/channel_stats_op.cu.cc",
"caffe2/operators/channelwise_conv3d_op_cudnn.cu.cc",
"caffe2/operators/clip_op.cu.cc",
"caffe2/operators/copy_op.cu.cc",
"caffe2/operators/cos_op.cu.cc",
"caffe2/operators/cosh_op.cu.cc",
"caffe2/operators/cosine_embedding_criterion_op.cu.cc",
"caffe2/operators/cross_entropy_op.cu.cc",
"caffe2/operators/cube_op.cu.cc",
"caffe2/operators/data_couple_gpu.cu.cc",
"caffe2/operators/deform_conv_op.cu.cc",
"caffe2/operators/depthwise_3x3_conv_op_cudnn.cu.cc",
"caffe2/operators/distance_op.cu.cc",
"caffe2/operators/dropout_op.cu.cc",
"caffe2/operators/elementwise_div_op.cu.cc",
"caffe2/operators/elementwise_linear_op.cu.cc",
"caffe2/operators/elementwise_mul_op.cu.cc",
"caffe2/operators/elementwise_ops.cu.cc",
"caffe2/operators/elu_op.cu.cc",
"caffe2/operators/enforce_finite_op.cu.cc",
"caffe2/operators/ensure_cpu_output_op.cu.cc",
"caffe2/operators/erf_op.cu.cc",
"caffe2/operators/filler_op.cu.cc",
"caffe2/operators/find_op.cu.cc",
"caffe2/operators/floor_op.cu.cc",
"caffe2/operators/gather_op.cu.cc",
"caffe2/operators/gelu_op.cu.cc",
"caffe2/operators/generate_proposals_op.cu.cc",
"caffe2/operators/generate_proposals_op_util_nms_gpu.cu.cc",
"caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu.cc",
"caffe2/operators/given_tensor_fill_op.cu.cc",
"caffe2/operators/glu_op.cu.cc",
"caffe2/operators/group_norm_op.cu.cc",
"caffe2/operators/gru_unit_op_gpu.cu.cc",
"caffe2/operators/half_float_ops.cu.cc",
"caffe2/operators/hard_sigmoid_op.cu.cc",
"caffe2/operators/instance_norm_op.cu.cc",
"caffe2/operators/integral_image_op.cu.cc",
"caffe2/operators/layer_norm_op.cu.cc",
"caffe2/operators/leaky_relu_op.cu.cc",
"caffe2/operators/lengths_pad_op.cu.cc",
"caffe2/operators/lengths_tile_op.cu.cc",
"caffe2/operators/local_response_normalization_op.cu.cc",
"caffe2/operators/logit_op.cu.cc",
"caffe2/operators/loss_op.cu.cc",
"caffe2/operators/lp_pool_op.cu.cc",
"caffe2/operators/lstm_unit_op_gpu.cu.cc",
"caffe2/operators/margin_ranking_criterion_op.cu.cc",
"caffe2/operators/max_pool_with_index.cu.cc",
"caffe2/operators/mean_op.cu.cc",
"caffe2/operators/mem_query_op.cu.cc",
"caffe2/operators/minmax_ops.cu.cc",
"caffe2/operators/moments_op.cu.cc",
"caffe2/operators/multi_class_accuracy_op.cu.cc",
"caffe2/operators/normalize_ops.cu.cc",
"caffe2/operators/one_hot_ops.cu.cc",
"caffe2/operators/pack_segments.cu.cc",
"caffe2/operators/pad_op_gpu.cu.cc",
"caffe2/operators/perplexity_op.cu.cc",
"caffe2/operators/piecewise_linear_transform_op.cu.cc",
"caffe2/operators/pool_op.cu.cc",
"caffe2/operators/pow_op.cu.cc",
"caffe2/operators/prelu_op.cu.cc",
"caffe2/operators/reciprocal_op.cu.cc",
"caffe2/operators/reduce_front_back_max_ops.cu.cc",
"caffe2/operators/reduce_front_back_sum_mean_ops.cu.cc",
"caffe2/operators/reduce_ops.cu.cc",
"caffe2/operators/reduction_ops.cu.cc",
"caffe2/operators/relu_n_op.cu.cc",
"caffe2/operators/relu_op.cu.cc",
"caffe2/operators/replace_nan_op.cu.cc",
"caffe2/operators/resize_3d_op.cu.cc",
"caffe2/operators/resize_op.cu.cc",
"caffe2/operators/reverse_packed_segs_op.cu.cc",
"caffe2/operators/rmac_regions_op.cu.cc",
"caffe2/operators/rnn/recurrent_network_op_gpu.cu.cc",
"caffe2/operators/roi_align_gradient_op.cu.cc",
"caffe2/operators/roi_align_op.cu.cc",
"caffe2/operators/roi_align_rotated_gradient_op.cu.cc",
"caffe2/operators/roi_align_rotated_op.cu.cc",
"caffe2/operators/roi_pool_op.cu.cc",
"caffe2/operators/rsqrt_op.cu.cc",
"caffe2/operators/scale_blobs_op.cu.cc",
"caffe2/operators/segment_reduction_op_gpu.cu.cc",
"caffe2/operators/selu_op.cu.cc",
"caffe2/operators/sequence_ops.cu.cc",
"caffe2/operators/sigmoid_op.cu.cc",
"caffe2/operators/sin_op.cu.cc",
"caffe2/operators/sinh_op.cu.cc",
"caffe2/operators/slice_op.cu.cc",
"caffe2/operators/softmax_ops.cu.cc",
"caffe2/operators/softplus_op.cu.cc",
"caffe2/operators/softsign_op.cu.cc",
"caffe2/operators/space_batch_op_gpu.cu.cc",
"caffe2/operators/sparse_normalize_op_gpu.cu.cc",
"caffe2/operators/sparse_to_dense_op.cu.cc",
"caffe2/operators/spatial_batch_norm_op.cu.cc",
"caffe2/operators/spatial_batch_norm_op_cudnn.cu.cc",
"caffe2/operators/stump_func_op.cu.cc",
"caffe2/operators/summarize_op.cu.cc",
"caffe2/operators/swish_op.cu.cc",
"caffe2/operators/tan_op.cu.cc",
"caffe2/operators/tanh_op.cu.cc",
"caffe2/operators/thresholded_relu_op.cu.cc",
"caffe2/operators/tile_op.cu.cc",
"caffe2/operators/top_k.cu.cc",
"caffe2/operators/transpose_op.cu.cc",
"caffe2/operators/unique_ops.cu.cc",
"caffe2/operators/upsample_op.cu.cc",
"caffe2/operators/utility_ops.cu.cc",
"caffe2/operators/weighted_sample_op.cu.cc",
"caffe2/sgd/adadelta_op_gpu.cu.cc",
"caffe2/sgd/adagrad_op_gpu.cu.cc",
"caffe2/sgd/adam_op_gpu.cu.cc",
"caffe2/sgd/fp16_momentum_sgd_op.cu.cc",
"caffe2/sgd/fp32_momentum_sgd_op.cu.cc",
"caffe2/sgd/lars_op_gpu.cu.cc",
"caffe2/sgd/momentum_sgd_op_gpu.cu.cc",
"caffe2/sgd/rmsprop_op_gpu.cu.cc",
"caffe2/sgd/yellowfin_op_gpu.cu.cc",
"caffe2/utils/math/broadcast.cu.cc",
"caffe2/utils/math/elementwise.cu.cc",
"caffe2/utils/math/reduce.cu.cc",
"caffe2/utils/math/transpose.cu.cc",
"caffe2/utils/math_gpu.cu.cc",
"caffe2/core/context_gpu.cu",
"caffe2/operators/abs_op.cu",
"caffe2/operators/accumulate_op.cu",
"caffe2/operators/accuracy_op.cu",
"caffe2/operators/acos_op.cu",
"caffe2/operators/affine_channel_op.cu",
"caffe2/operators/alias_with_name.cu",
"caffe2/operators/arg_ops.cu",
"caffe2/operators/asin_op.cu",
"caffe2/operators/assert_op.cu",
"caffe2/operators/atan_op.cu",
"caffe2/operators/batch_gather_ops.cu",
"caffe2/operators/batch_matmul_op.cu",
"caffe2/operators/batch_moments_op.cu",
"caffe2/operators/batch_permutation_op.cu",
"caffe2/operators/batch_sparse_to_dense_op.cu",
"caffe2/operators/boolean_mask_ops.cu",
"caffe2/operators/boolean_unmask_ops.cu",
"caffe2/operators/bucketize_op.cu",
"caffe2/operators/cast_op.cu",
"caffe2/operators/cbrt_op.cu",
"caffe2/operators/ceil_op.cu",
"caffe2/operators/channel_backprop_stats_op.cu",
"caffe2/operators/channel_shuffle_op.cu",
"caffe2/operators/channel_stats_op.cu",
"caffe2/operators/channelwise_conv3d_op_cudnn.cu",
"caffe2/operators/clip_op.cu",
"caffe2/operators/copy_op.cu",
"caffe2/operators/cos_op.cu",
"caffe2/operators/cosh_op.cu",
"caffe2/operators/cosine_embedding_criterion_op.cu",
"caffe2/operators/cross_entropy_op.cu",
"caffe2/operators/cube_op.cu",
"caffe2/operators/data_couple_gpu.cu",
"caffe2/operators/deform_conv_op.cu",
"caffe2/operators/depthwise_3x3_conv_op_cudnn.cu",
"caffe2/operators/distance_op.cu",
"caffe2/operators/dropout_op.cu",
"caffe2/operators/elementwise_div_op.cu",
"caffe2/operators/elementwise_linear_op.cu",
"caffe2/operators/elementwise_mul_op.cu",
"caffe2/operators/elementwise_ops.cu",
"caffe2/operators/elu_op.cu",
"caffe2/operators/enforce_finite_op.cu",
"caffe2/operators/ensure_cpu_output_op.cu",
"caffe2/operators/erf_op.cu",
"caffe2/operators/filler_op.cu",
"caffe2/operators/find_op.cu",
"caffe2/operators/floor_op.cu",
"caffe2/operators/gather_op.cu",
"caffe2/operators/gelu_op.cu",
"caffe2/operators/generate_proposals_op.cu",
"caffe2/operators/generate_proposals_op_util_nms_gpu.cu",
"caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu",
"caffe2/operators/given_tensor_fill_op.cu",
"caffe2/operators/glu_op.cu",
"caffe2/operators/group_norm_op.cu",
"caffe2/operators/gru_unit_op_gpu.cu",
"caffe2/operators/half_float_ops.cu",
"caffe2/operators/hard_sigmoid_op.cu",
"caffe2/operators/instance_norm_op.cu",
"caffe2/operators/integral_image_op.cu",
"caffe2/operators/layer_norm_op.cu",
"caffe2/operators/leaky_relu_op.cu",
"caffe2/operators/lengths_pad_op.cu",
"caffe2/operators/lengths_tile_op.cu",
"caffe2/operators/local_response_normalization_op.cu",
"caffe2/operators/logit_op.cu",
"caffe2/operators/loss_op.cu",
"caffe2/operators/lp_pool_op.cu",
"caffe2/operators/lstm_unit_op_gpu.cu",
"caffe2/operators/margin_ranking_criterion_op.cu",
"caffe2/operators/max_pool_with_index.cu",
"caffe2/operators/mean_op.cu",
"caffe2/operators/mem_query_op.cu",
"caffe2/operators/minmax_ops.cu",
"caffe2/operators/moments_op.cu",
"caffe2/operators/multi_class_accuracy_op.cu",
"caffe2/operators/normalize_ops.cu",
"caffe2/operators/one_hot_ops.cu",
"caffe2/operators/pack_segments.cu",
"caffe2/operators/pad_op_gpu.cu",
"caffe2/operators/perplexity_op.cu",
"caffe2/operators/piecewise_linear_transform_op.cu",
"caffe2/operators/pool_op.cu",
"caffe2/operators/pow_op.cu",
"caffe2/operators/prelu_op.cu",
"caffe2/operators/reciprocal_op.cu",
"caffe2/operators/reduce_front_back_max_ops.cu",
"caffe2/operators/reduce_front_back_sum_mean_ops.cu",
"caffe2/operators/reduce_ops.cu",
"caffe2/operators/reduction_ops.cu",
"caffe2/operators/relu_n_op.cu",
"caffe2/operators/relu_op.cu",
"caffe2/operators/replace_nan_op.cu",
"caffe2/operators/resize_3d_op.cu",
"caffe2/operators/resize_op.cu",
"caffe2/operators/reverse_packed_segs_op.cu",
"caffe2/operators/rmac_regions_op.cu",
"caffe2/operators/rnn/recurrent_network_op_gpu.cu",
"caffe2/operators/roi_align_gradient_op.cu",
"caffe2/operators/roi_align_op.cu",
"caffe2/operators/roi_align_rotated_gradient_op.cu",
"caffe2/operators/roi_align_rotated_op.cu",
"caffe2/operators/roi_pool_op.cu",
"caffe2/operators/rsqrt_op.cu",
"caffe2/operators/scale_blobs_op.cu",
"caffe2/operators/segment_reduction_op_gpu.cu",
"caffe2/operators/selu_op.cu",
"caffe2/operators/sequence_ops.cu",
"caffe2/operators/sigmoid_op.cu",
"caffe2/operators/sin_op.cu",
"caffe2/operators/sinh_op.cu",
"caffe2/operators/slice_op.cu",
"caffe2/operators/softmax_ops.cu",
"caffe2/operators/softplus_op.cu",
"caffe2/operators/softsign_op.cu",
"caffe2/operators/space_batch_op_gpu.cu",
"caffe2/operators/sparse_normalize_op_gpu.cu",
"caffe2/operators/sparse_to_dense_op.cu",
"caffe2/operators/spatial_batch_norm_op.cu",
"caffe2/operators/spatial_batch_norm_op_cudnn.cu",
"caffe2/operators/stump_func_op.cu",
"caffe2/operators/summarize_op.cu",
"caffe2/operators/swish_op.cu",
"caffe2/operators/tan_op.cu",
"caffe2/operators/tanh_op.cu",
"caffe2/operators/thresholded_relu_op.cu",
"caffe2/operators/tile_op.cu",
"caffe2/operators/top_k.cu",
"caffe2/operators/transpose_op.cu",
"caffe2/operators/unique_ops.cu",
"caffe2/operators/upsample_op.cu",
"caffe2/operators/utility_ops.cu",
"caffe2/operators/weighted_sample_op.cu",
"caffe2/sgd/adadelta_op_gpu.cu",
"caffe2/sgd/adagrad_op_gpu.cu",
"caffe2/sgd/adam_op_gpu.cu",
"caffe2/sgd/fp16_momentum_sgd_op.cu",
"caffe2/sgd/fp32_momentum_sgd_op.cu",
"caffe2/sgd/lars_op_gpu.cu",
"caffe2/sgd/momentum_sgd_op_gpu.cu",
"caffe2/sgd/rmsprop_op_gpu.cu",
"caffe2/sgd/yellowfin_op_gpu.cu",
"caffe2/utils/math/broadcast.cu",
"caffe2/utils/math/elementwise.cu",
"caffe2/utils/math/reduce.cu",
"caffe2/utils/math/transpose.cu",
"caffe2/utils/math_gpu.cu",
],
)
@ -1432,6 +1363,29 @@ cc_library(
],
)
py_binary(
name = "gen_op",
srcs = ["caffe2/contrib/aten/gen_op.py"],
deps = [":tools_codegen"],
)
genrule(
name = "generated_caffe2_aten_op_headers",
srcs = [
"caffe2/contrib/aten/aten_op_template.h",
"aten/src/ATen/Declarations.yaml",
],
outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"],
cmd = """
$(location :gen_op) \
--output_prefix gen_ \
--install_dir $(@D) \
--aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. \
--template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \
--yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""",
tools = [":gen_op"],
)
cc_library(
name = "caffe2_headers",
hdrs = glob([
@ -1472,7 +1426,7 @@ cc_library(
]) + if_cuda(glob([
"caffe2/**/*.cuh",
"caffe2/image/*.h",
])),
])) + [":generated_caffe2_aten_op_headers"],
copts = CAFFE2_COPTS,
includes = [
"caffe2/contrib/aten",
@ -1554,7 +1508,7 @@ cc_library(
"@fmt",
] + if_cuda(
[
":caffe2_cpp_cuda",
":caffe2_cuda_cpp",
":aten_cuda",
"@tensorpipe//:tensorpipe_cuda",
],
@ -1567,8 +1521,8 @@ cc_library(
)
cc_library(
name = "caffe2_cpp_cuda",
srcs = [":caffe2_cuda_srcs"],
name = "caffe2_cuda_cpp",
srcs = [":caffe2_cuda_cpp_srcs"],
copts = CAFFE2_COPTS,
visibility = ["//visibility:public"],
deps = [
@ -1586,7 +1540,6 @@ cu_library(
deps = [
":aten",
":caffe2_headers",
"@cub",
"@cuda//:cublas",
"@cuda//:curand",
"@cudnn",
@ -1610,6 +1563,7 @@ PERF_COPTS = [
"-DHAVE_SHM_OPEN=1",
"-DHAVE_SHM_UNLINK=1",
"-DSLEEF_STATIC_LIBS=1",
"-DTH_BALS_MKL",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-fvisibility-inlines-hidden",
@ -1693,10 +1647,29 @@ genrule(
srcs = ["torch/csrc/api/include/torch/version.h.in", "version.txt"],
outs = ["torch/csrc/api/include/torch/version.h"],
cmd = "$(location :gen_version_header) --template-path $(location torch/csrc/api/include/torch/version.h.in) --version-path $(location version.txt) --output-path $@",
tools = [':gen_version_header']
tools = [':gen_version_header'],
)
torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])
py_binary(
name = "stringify_file",
srcs = ["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"],
)
generated_nvfuser_hdrs = ["generated_" + hdr for hdr in libtorch_nvfuser_generated_headers]
[
genrule(
name = name,
srcs = [src],
outs = ["nvfuser_resources/{}".format(hdr)],
cmd = "$(location :stringify_file) -i $< -o $@",
tools = [":stringify_file"],
)
for name, src, hdr in zip(generated_nvfuser_hdrs, libtorch_nvfuser_runtime_sources, libtorch_nvfuser_generated_headers)
]
torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) + generated_nvfuser_hdrs
cc_library(
name = "torch_headers",
hdrs = if_cuda(
@ -1707,6 +1680,7 @@ cc_library(
"torch/csrc/**/*.h",
"torch/csrc/distributed/c10d/*.hpp",
"torch/lib/libshm/*.h",
"torch/csrc/generic/*.cpp",
],
exclude = [
"torch/csrc/autograd/generated/VariableType.h",
@ -1743,21 +1717,25 @@ TORCH_COPTS = COMMON_COPTS + [
"-fno-trapping-math",
]
cu_library(
name = "torch_distributed_cuda",
srcs = ["torch/csrc/distributed/c10d/quantization/quantization_gpu.cu"],
deps = [":torch_headers"],
)
cc_library(
name = "torch",
srcs = if_cuda(glob(
[
"torch/csrc/cuda/*.cpp",
"torch/csrc/autograd/functions/comm.cpp",
],
libtorch_cuda_sources,
exclude = [
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
)) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [
":cpp_generated_code",
],
copts = TORCH_COPTS + if_cuda(["-DUSE_CUDA=1"]),
copts = TORCH_COPTS,
defines = [
"CAFFE2_NIGHTLY_VERSION=20200115",
],
@ -1765,7 +1743,10 @@ cc_library(
deps = [
":caffe2",
":torch_headers",
],
] + if_cuda([
":torch_distributed_cuda",
"@cuda//:nvToolsExt",
]),
alwayslink = True,
)
@ -1783,10 +1764,9 @@ cc_library(
"**/*.h",
"**/*.cuh",
]) + [
":generated_code",
":cpp_generated_code",
],
includes = [
".",
"torch/csrc/api/include",
"torch/csrc/distributed",
"torch/lib",
@ -1794,21 +1774,17 @@ cc_library(
],
visibility = ["//visibility:public"],
deps = [
":aten_headers",
":caffe2_headers",
"//c10:headers",
":torch_headers",
],
)
cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources + [":python_generated_code"],
hdrs = glob([
"torch/csrc/generic/*.cpp",
]),
deps = [
":torch",
":shm",
"@pybind11",
],
)
@ -1842,11 +1818,16 @@ cc_library(
# Torch integration tests rely on a labeled data set from the MNIST database.
# http://yann.lecun.com/exdb/mnist/
cpp_api_tests = glob(["test/cpp/api/*.cpp"])
# imethod.cpp is excluded since torch/csrc/deploy* build is not yet supported.
cpp_api_tests = glob(
["test/cpp/api/*.cpp"],
exclude = ["test/cpp/api/imethod.cpp"],
)
[
cc_test(
name = paths.split_extension(paths.basename(filename))[0].replace("-","_") + "_test",
size = "medium",
size = "medium",
srcs = [filename],
deps = [
":test_support",


@ -1,7 +1,22 @@
workspace(name = "pytorch")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository")
load("//tools/rules:workspace.bzl", "new_patched_local_repository")
http_archive(
name = "rules_cuda",
sha256 = "f80438bee9906e9ecb1a8a4ae2365374ac1e8a283897281a2db2fb7fcf746333",
strip_prefix = "runtime-b1c7cce21ba4661c17ac72421c6a0e2015e7bef3/third_party/rules_cuda",
urls = ["https://github.com/tensorflow/runtime/archive/b1c7cce21ba4661c17ac72421c6a0e2015e7bef3.tar.gz"],
)
load("@rules_cuda//cuda:dependencies.bzl", "rules_cuda_dependencies")
rules_cuda_dependencies()
load("@rules_cc//cc:repositories.bzl", "rules_cc_toolchains")
rules_cc_toolchains()
http_archive(
name = "bazel_skylib",
@ -171,13 +186,14 @@ load("@rules_python//python:repositories.bzl", "py_repositories")
py_repositories()
local_repository(
name = "local_config_cuda",
path = "third_party/tensorflow_cuda_bazel_build",
new_local_repository(
name = "cuda",
build_file = "@//third_party:cuda.BUILD",
path = "/usr/local/cuda",
)
# Wrapper to expose local_config_cuda in an agnostic way
new_empty_repository(
name = "cuda",
build_file = "//third_party:cuda.BUILD",
new_local_repository(
name = "cudnn",
build_file = "@//third_party:cudnn.BUILD",
path = "/usr/",
)

third_party/cuda.BUILD (vendored)

@ -1,43 +1,76 @@
"""
Collect all the CUDA stuff from @local_config_cuda in a single target
for convenience.
"""
# Adopted from: https://github.com/tensorflow/runtime/blob/master/third_party/rules_cuda/private/BUILD.local_cuda
# Library targets are created corresponding to BUILD.bazel's needs.
cc_library(
name = "cuda_headers",
hdrs = glob([
"include/**",
"targets/x86_64-linux/include/**",
]),
includes = [
"include",
"targets/x86_64-linux/include",
],
visibility = ["//visibility:public"],
)
cc_library(
name = "cuda_driver",
srcs = ["lib64/stubs/libcuda.so"],
visibility = ["//visibility:public"],
)
cc_library(
name = "cuda",
srcs = ["targets/x86_64-linux/lib/libcudart.so"],
visibility = ["//visibility:public"],
deps = [
"@local_config_cuda//cuda:cublas",
"@local_config_cuda//cuda:cuda_driver",
"@local_config_cuda//cuda:cuda_headers",
"@local_config_cuda//cuda:cudart",
"@local_config_cuda//cuda:cufft",
"@local_config_cuda//cuda:curand",
],
deps = [":cuda_headers"],
)
cc_library(
name = "cupti",
deps = [
"@local_config_cuda//cuda:cupti_headers",
"@local_config_cuda//cuda:cupti_link",
],
name = "cufft",
srcs = ["targets/x86_64-linux/lib/libcufft.so"],
visibility = ["//visibility:public"],
)
[
alias(
name = lib,
actual = "@local_config_cuda//cuda:{}".format(lib),
visibility = ["//visibility:public"],
)
for lib in [
"cublas",
"cufft",
"cusolver",
"cusparse",
"curand",
"nvrtc",
"cuda_driver",
"nvToolsExt",
]
]
cc_library(
name = "cublas",
srcs = [
"targets/x86_64-linux/lib/libcublasLt.so",
"targets/x86_64-linux/lib/libcublas.so",
],
visibility = ["//visibility:public"],
)
cc_library(
name = "curand",
srcs = ["targets/x86_64-linux/lib/libcurand.so"],
visibility = ["//visibility:public"],
)
cc_library(
name = "cusolver",
srcs = ["targets/x86_64-linux/lib/libcusolver.so"],
visibility = ["//visibility:public"],
)
cc_library(
name = "cusparse",
srcs = ["targets/x86_64-linux/lib/libcusparse.so"],
visibility = ["//visibility:public"],
)
cc_library(
name = "nvrtc",
srcs = [
"targets/x86_64-linux/lib/libnvrtc.so",
"targets/x86_64-linux/lib/libnvrtc-builtins.so",
],
visibility = ["//visibility:public"],
)
cc_library(
name = "nvToolsExt",
srcs = [ "lib64/libnvToolsExt.so"],
visibility = ["//visibility:public"],
)

third_party/cudnn.BUILD (vendored, new file)

@ -0,0 +1,26 @@
# Adopted from: https://github.com/NVIDIA/TRTorch/blob/master/third_party/cudnn/local/BUILD
cc_library(
name = "cudnn_headers",
hdrs = ["include/cudnn.h"] + glob([
"include/cudnn+.h",
"include/cudnn_*.h",
]),
includes = ["include/"],
visibility = ["//visibility:private"],
)
cc_import(
name = "cudnn_lib",
shared_library = "lib/x86_64-linux-gnu/libcudnn.so",
visibility = ["//visibility:private"],
)
cc_library(
name = "cudnn",
visibility = ["//visibility:public"],
deps = [
"cudnn_headers",
"cudnn_lib",
],
)


@ -48,8 +48,8 @@ cc_library(
cu_library(
name = "gloo_cuda",
srcs = [
"gloo/cuda.cu.cc",
"gloo/cuda_private.cu.cc",
"gloo/cuda.cu",
"gloo/cuda_private.cu",
],
visibility = ["//visibility:public"],
deps = [
@ -72,8 +72,8 @@ cc_library(
"gloo/cuda*.cc",
"gloo/common/win.cc",
"gloo/rendezvous/redis_store.cc",
],
),
]
) + if_cuda(glob(["gloo/cuda*.cc"])),
copts = [
"-std=gnu++11",
"-std=c++11",


@ -58,7 +58,7 @@ def header_template_rule_impl(ctx):
CcInfo(compilation_context = cc_common.create_compilation_context(
# pass out the include path for finding this header
includes = depset([ctx.outputs.out.dirname, ctx.bin_dir.path]),
system_includes = depset([ctx.attr.include, ctx.outputs.out.dirname, ctx.bin_dir.path]),
# and the actual header here.
headers = depset([ctx.outputs.out]),
@ -68,6 +68,7 @@ def header_template_rule_impl(ctx):
header_template_rule = rule(
attrs = {
"out": attr.output(mandatory = True),
"include": attr.string(),
"src": attr.label(
mandatory = True,
allow_single_file = True,


@ -1,5 +0,0 @@
# Config for CUDA
This is a checked-in copy of the auto-generated config for building CUDA code with bazel. The content of this folder was generated from https://github.com/tensorflow/tensorflow `./configure` execution and then edited manually to fit the pytorch needs.
The LICENSE for the TensorFlow project is APACHE 2. The full LICENSE file could be found here https://github.com/tensorflow/tensorflow/blob/master/LICENSE.


@ -1 +0,0 @@
workspace(name = "local_config_cuda")


@ -1,451 +0,0 @@
licenses([
"restricted",
"reciprocal",
"notice",
]) # MPL2, portions GPL v3, LGPL v3, BSD-like
package(default_visibility = ["//visibility:public"])
config_setting(
name = "using_nvcc",
values = {
"define": "using_cuda_nvcc=true",
},
)
config_setting(
name = "using_clang",
values = {
"define": "using_cuda_clang=true",
},
)
# Equivalent to using_clang && -c opt.
config_setting(
name = "using_clang_opt",
values = {
"define": "using_cuda_clang=true",
"compilation_mode": "opt",
},
)
config_setting(
name = "darwin",
values = {"cpu": "darwin"},
)
cc_library(
name = "cuda_headers",
hdrs = [
":cuda-include",
":cudnn-include",
],
includes = [
".",
"include",
],
)
cc_library(
name = "cudnn_headers",
hdrs = [
":cudnn-include",
],
includes = [
".",
"include",
],
)
cc_library(
name = "cudart_static",
linkopts = [
"-L/usr/local/cuda/lib64",
],
)
cc_library(
name = "cuda_driver",
linkopts = ["-lcuda"],
deps = [":linker_search_path"],
)
# Provides the RPATH for Nvidia-less sytems to be able to run binaries linked to libcuda.
cc_library(
name = "driver_stub_runtime",
linkopts = [
"-Wl,-rpath,/usr/local/cuda/lib64/stubs",
],
deps = [":cuda_driver"],
)
cc_library(
name = "linker_search_path",
linkopts = [
"-L/usr/local/cuda/lib64",
"-L/usr/local/cuda/lib64/stubs",
"-Wl,-rpath-link,/usr/local/cuda/lib64",
"-Wl,-rpath-link,/usr/local/cuda/lib64/stubs",
],
)
[
cc_library(
name = libname,
linkopts = ["-l" + libname] + (["-lgomp"] if (libname == "cusolver") else []),
linkstatic = True,
deps = [":linker_search_path"],
)
for libname in [
"cublas",
"cudart",
"cudnn",
"cufft",
"curand",
"cusolver",
"cusparse",
"nvrtc",
"nvToolsExt",
]
]
cc_library(
name = "cuda",
deps = [
":cublas",
":cuda_headers",
":cudart",
":cudnn",
":cufft",
":curand",
":nvToolsExt",
],
)
# NVIDIA Performance Primitives (http://docs.nvidia.com/cuda/npp/modules.html))
# used by OpenCV
cc_library(
name = "nppi",
linkopts = [
"-lnppc",
"-lnppial",
"-lnppicom",
"-lnppidei",
"-lnppif",
"-lnppig",
"-lnppim",
"-lnppist",
"-lnppitc",
"-lnpps",
],
linkstatic = True,
deps = [":linker_search_path"],
)
# NVIDIA Management Library
cc_library(
name = "nvml",
linkopts = [
"-lnvidia-ml",
"-Wl,-rpath,/usr/lib/nvidia-410",
"-Wl,-rpath,/usr/lib/nvidia-390",
"-Wl,-rpath,/usr/lib/nvidia-387",
"-Wl,-rpath,/usr/lib/nvidia-384",
],
deps = [":linker_search_path"],
)
cc_library(
name = "cupti_headers",
hdrs = [
":cuda-extras",
],
includes = [
".",
"extras/CUPTI/include/",
],
)
# cupti .so exposed at linktime
cc_library(
name = "cupti_link",
linkopts = [
"-L/usr/local/cuda/extras/CUPTI/lib64",
"-lcupti",
],
)
cc_library(
name = "libdevice_root",
data = [":cuda-nvvm"],
)
CUDA_INCLUDES_FILES = [
"include/builtin_types.h",
"include/channel_descriptor.h",
"include/CL/cl_egl.h",
"include/CL/cl_ext.h",
"include/CL/cl_gl_ext.h",
"include/CL/cl_gl.h",
"include/CL/cl.h",
"include/CL/cl.hpp",
"include/CL/cl_platform.h",
"include/CL/opencl.h",
"include/common_functions.h",
"include/cooperative_groups.h",
"include/cooperative_groups_helpers.h",
"include/crt/common_functions.h",
"include/crt/device_double_functions.h",
"include/crt/device_double_functions.hpp",
"include/crt/device_functions.h",
"include/crt/device_functions.hpp",
"include/crt/func_macro.h",
"include/crt/host_config.h",
"include/crt/host_defines.h",
"include/crt/host_runtime.h",
"include/crt/math_functions.h",
"include/crt/math_functions.hpp",
"include/crt/mma.h",
"include/crt/mma.hpp",
"include/crt/nvfunctional",
"include/crt/sm_70_rt.h",
"include/crt/sm_70_rt.hpp",
"include/crt/storage_class.h",
# TODO: figure out why on a CI machine with CUDA 10.2 it's not present
# "include/cublas_api.h",
# "include/cublas.h",
# "include/cublas_v2.h",
# "include/cublasXt.h",
"include/cuComplex.h",
"include/cuda_device_runtime_api.h",
"include/cudaEGL.h",
"include/cuda_egl_interop.h",
"include/cuda_fp16.h",
"include/cuda_fp16.hpp",
"include/cudaGL.h",
"include/cuda_gl_interop.h",
"include/cuda.h",
"include/cudalibxt.h",
"include/cuda_occupancy.h",
"include/cuda_profiler_api.h",
"include/cudaProfiler.h",
"include/cudart_platform.h",
"include/cuda_runtime_api.h",
"include/cuda_runtime.h",
"include/cuda_surface_types.h",
"include/cuda_texture_types.h",
"include/cudaVDPAU.h",
"include/cuda_vdpau_interop.h",
"include/cufft.h",
"include/cufftw.h",
"include/cufftXt.h",
"include/curand_discrete2.h",
"include/curand_discrete.h",
"include/curand_globals.h",
"include/curand.h",
"include/curand_kernel.h",
"include/curand_lognormal.h",
"include/curand_mrg32k3a.h",
"include/curand_mtgp32dc_p_11213.h",
"include/curand_mtgp32.h",
"include/curand_mtgp32_host.h",
"include/curand_mtgp32_kernel.h",
"include/curand_normal.h",
"include/curand_normal_static.h",
"include/curand_philox4x32_x.h",
"include/curand_poisson.h",
"include/curand_precalc.h",
"include/curand_uniform.h",
"include/cusolver_common.h",
"include/cusolverDn.h",
"include/cusolverRf.h",
"include/cusolverSp.h",
"include/cusolverSp_LOWLEVEL_PREVIEW.h",
"include/cusparse.h",
"include/cusparse_v2.h",
"include/device_atomic_functions.h",
"include/device_atomic_functions.hpp",
"include/device_double_functions.h",
"include/device_functions.h",
"include/device_launch_parameters.h",
"include/device_types.h",
"include/driver_functions.h",
"include/driver_types.h",
"include/fatBinaryCtl.h",
"include/fatbinary.h",
"include/host_config.h",
"include/host_defines.h",
"include/library_types.h",
"include/math_constants.h",
"include/math_functions.h",
"include/mma.h",
"include/nppcore.h",
"include/nppdefs.h",
"include/npp.h",
"include/nppi_arithmetic_and_logical_operations.h",
"include/nppi_color_conversion.h",
"include/nppi_compression_functions.h",
"include/nppi_computer_vision.h",
"include/nppi_data_exchange_and_initialization.h",
"include/nppi_filtering_functions.h",
"include/nppi_geometry_transforms.h",
"include/nppi.h",
"include/nppi_linear_transforms.h",
"include/nppi_morphological_operations.h",
"include/nppi_statistics_functions.h",
"include/nppi_support_functions.h",
"include/nppi_threshold_and_compare_operations.h",
"include/npps_arithmetic_and_logical_operations.h",
"include/npps_conversion_functions.h",
"include/npps_filtering_functions.h",
"include/npps.h",
"include/npps_initialization.h",
"include/npps_statistics_functions.h",
"include/npps_support_functions.h",
# Note: CUDA 10.0 only
# "include/nppversion.h",
# TODO: figure out why on a CI machine with CUDA 10.2 it's not present
# "include/nvblas.h",
"include/nvfunctional",
"include/nvgraph.h",
"include/nvjpeg.h",
"include/nvml.h",
"include/nvrtc.h",
"include/nvToolsExtCuda.h",
"include/nvToolsExtCudaRt.h",
"include/nvToolsExt.h",
"include/nvToolsExtMeta.h",
"include/nvToolsExtSync.h",
"include/nvtx3/nvToolsExtCuda.h",
"include/nvtx3/nvToolsExtCudaRt.h",
"include/nvtx3/nvToolsExt.h",
"include/nvtx3/nvToolsExtOpenCL.h",
"include/nvtx3/nvToolsExtSync.h",
"include/nvtx3/nvtxDetail/nvtxImplCore.h",
"include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h",
"include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h",
"include/nvtx3/nvtxDetail/nvtxImpl.h",
"include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h",
"include/nvtx3/nvtxDetail/nvtxImplSync_v3.h",
"include/nvtx3/nvtxDetail/nvtxInitDecls.h",
"include/nvtx3/nvtxDetail/nvtxInitDefs.h",
"include/nvtx3/nvtxDetail/nvtxInit.h",
"include/nvtx3/nvtxDetail/nvtxLinkOnce.h",
"include/nvtx3/nvtxDetail/nvtxTypes.h",
"include/sm_20_atomic_functions.h",
"include/sm_20_atomic_functions.hpp",
"include/sm_20_intrinsics.h",
"include/sm_20_intrinsics.hpp",
"include/sm_30_intrinsics.h",
"include/sm_30_intrinsics.hpp",
"include/sm_32_atomic_functions.h",
"include/sm_32_atomic_functions.hpp",
"include/sm_32_intrinsics.h",
"include/sm_32_intrinsics.hpp",
"include/sm_35_atomic_functions.h",
"include/sm_35_intrinsics.h",
"include/sm_60_atomic_functions.h",
"include/sm_60_atomic_functions.hpp",
"include/sm_61_intrinsics.h",
"include/sm_61_intrinsics.hpp",
# CUDA 10.0 only
# "include/sobol_direction_vectors.h",
"include/surface_functions.h",
"include/surface_functions.hpp",
"include/surface_indirect_functions.h",
"include/surface_indirect_functions.hpp",
"include/surface_types.h",
"include/texture_fetch_functions.h",
"include/texture_fetch_functions.hpp",
"include/texture_indirect_functions.h",
"include/texture_indirect_functions.hpp",
"include/texture_types.h",
"include/vector_functions.h",
"include/vector_functions.hpp",
"include/vector_types.h",
]
genrule(
name = "cuda-include",
outs = CUDA_INCLUDES_FILES,
cmd = " && ".join([
"ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
for p in CUDA_INCLUDES_FILES
]),
local = True,
tags = ["no-cache"],
)
CUDA_NVVM_FILES = [
"nvvm/bin/cicc",
"nvvm/include/nvvm.h",
"nvvm/lib64/libnvvm.so",
"nvvm/lib64/libnvvm.so.3",
"nvvm/lib64/libnvvm.so.3.3.0",
"nvvm/libdevice/libdevice.10.bc",
]
genrule(
name = "cuda-nvvm",
outs = CUDA_NVVM_FILES,
cmd = " && ".join([
"ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
for p in CUDA_NVVM_FILES
]),
local = True,
tags = ["no-cache"],
)
CUDA_EXTRAS_FILES = [
"extras/CUPTI/include/cuda_stdint.h",
"extras/CUPTI/include/cupti.h",
"extras/CUPTI/include/cupti_activity.h",
"extras/CUPTI/include/cupti_callbacks.h",
"extras/CUPTI/include/cupti_driver_cbid.h",
"extras/CUPTI/include/cupti_events.h",
"extras/CUPTI/include/cupti_metrics.h",
"extras/CUPTI/include/cupti_nvtx_cbid.h",
"extras/CUPTI/include/cupti_result.h",
"extras/CUPTI/include/cupti_runtime_cbid.h",
"extras/CUPTI/include/cupti_version.h",
"extras/CUPTI/include/generated_cuda_gl_interop_meta.h",
"extras/CUPTI/include/generated_cuda_meta.h",
"extras/CUPTI/include/generated_cuda_runtime_api_meta.h",
"extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h",
"extras/CUPTI/include/generated_cudaGL_meta.h",
"extras/CUPTI/include/generated_cudaVDPAU_meta.h",
"extras/CUPTI/include/generated_nvtx_meta.h",
"extras/CUPTI/include/GL/gl.h",
"extras/CUPTI/include/GL/glew.h",
"extras/CUPTI/include/GL/glext.h",
"extras/CUPTI/include/GL/glu.h",
"extras/CUPTI/include/GL/glut.h",
"extras/CUPTI/include/GL/glx.h",
"extras/CUPTI/include/GL/glxext.h",
"extras/CUPTI/include/GL/wglew.h",
"extras/CUPTI/include/GL/wglext.h",
"extras/CUPTI/include/openacc/cupti_openacc.h",
]
genrule(
name = "cuda-extras",
outs = CUDA_EXTRAS_FILES,
cmd = " && ".join([
"ln -s /usr/local/cuda/{relpath} $(@D)/{relpath}".format(relpath = p)
for p in CUDA_EXTRAS_FILES
]),
local = True,
tags = ["no-cache"],
)
genrule(
name = "cudnn-include",
outs = [
"include/cudnn.h",
],
cmd = """
ln -s /usr/include/cudnn.h $(@D)/cudnn.h""",
local = True,
tags = ["no-cache"],
)


@ -162,8 +162,8 @@ cc_library(
cc_library(
name = "tensorpipe_cuda",
srcs = TENSORPIPE_CUDA_SOURCES,
hdrs = TENSORPIPE_CUDA_HEADERS + [":tensorpipe_cuda_config_header"],
srcs = glob(TENSORPIPE_CUDA_SOURCES),
hdrs = glob(TENSORPIPE_CUDA_HEADERS) + [":tensorpipe_cuda_config_header"],
includes = [
".",
],


@ -1,3 +1,6 @@
# gpu support is not available
def cu_library(**kwargs):
pass
load("@rules_cuda//cuda:defs.bzl", "cuda_library")
NVCC_COPTS = ["--expt-relaxed-constexpr", "--expt-extended-lambda"]
def cu_library(name, srcs, copts = [], **kwargs):
cuda_library(name, srcs = srcs, copts = NVCC_COPTS + copts, **kwargs)
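
As a usage note (a sketch, not part of the diff): the wrapper only prepends the two nvcc flags and forwards everything else, so a hypothetical call such as

    cu_library(
        name = "example_cuda",   # hypothetical target
        srcs = ["example.cu"],
        copts = ["-O2"],
    )

invokes `cuda_library` with `copts = ["--expt-relaxed-constexpr", "--expt-extended-lambda", "-O2"]`, passing `deps`, `visibility`, and any other keyword arguments through unchanged. The callers visible in this diff are `cu_library` targets such as `:aten_cuda` and `:torch_distributed_cuda` in `BUILD.bazel` and `gloo_cuda` in the gloo build file.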