Update base for Update on "(2/2) Make TorchScript Preserve Fully Qualified Class Name for Python Exceptions: frontend change"

Reland D33282878. This is the frontend change.

Differential Revision: [D33342569](https://our.internmc.facebook.com/intern/diff/D33342569/)

**NOTE FOR REVIEWERS**: This PR has internal, Facebook-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D33342569/)!

[ghstack-poisoned]
Shunting Zhang 2022-02-15 12:13:40 -08:00
commit 36b39d6692
336 changed files with 18536 additions and 20565 deletions
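
For context on what the commit title refers to, here is a minimal sketch of the intended user-visible behavior. It is not taken from this diff (the hunks below are CI workflow template and generated-YAML churn from rebasing the ghstack base); the custom exception class, the sample message, and the exact error formatting are illustrative assumptions.

```python
# Illustrative sketch only (not part of this diff): what "preserve fully
# qualified class name for Python exceptions" means for TorchScript users.
# The custom exception, message, and printed output are assumptions.
import torch

class MyCustomError(Exception):
    pass

@torch.jit.script
def checked_add(x: int, y: int) -> int:
    if x < 0:
        # Raised inside scripted code; historically this surfaced as a
        # generic torch.jit.Error/RuntimeError without the original class name.
        raise MyCustomError("x must be non-negative")
    return x + y

try:
    checked_add(-1, 2)
except Exception as e:
    # With the frontend change, the error is expected to carry the fully
    # qualified name of the Python exception class (e.g. "__main__.MyCustomError")
    # rather than only a generic TorchScript error type.
    print(type(e).__name__, e)
```
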

View File

@ -14,7 +14,13 @@
{
"name": "OSS CI",
"patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**"],
"approved_by": ["seemethere", "malfet", "suo"],
"approved_by": ["seemethere", "malfet", "suo", "janeyx99"],
"mandatory_app_id": 12274
},
{
"name": "Documentation",
"patterns": ["docs/**", "torch/*docs.py"],
"approved_by": ["mruberry", "ngimel", "albanD", "janeyx99"],
"mandatory_app_id": 12274
}
]

View File

@ -1,4 +1,5 @@
{% import 'common.yml.j2' as common %}
{% import 'upload.yml.j2' as upload %}
{%- block name -%}
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
@ -6,24 +7,6 @@
name: !{{ build_environment }}
{%- endblock %}
{%- macro binary_env(config) -%}
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: !{{ config["package_type"] }}
SKIP_ALL_TESTS: 1
DESIRED_CUDA: cpu
{%- if config["package_type"] == "libtorch" %}
LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }}
DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
{%- else %}
DESIRED_PYTHON: "!{{ config["python_version"] }}"
{%- endif %}
{%- endmacro %}
{%- macro set_runner_specific_vars() -%}
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -83,7 +66,7 @@ jobs:
{%- else %}
timeout-minutes: !{{ common.timeout_minutes }}
{%- endif %}
!{{ binary_env(config) }}
!{{ upload.binary_env(config, true) }}
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -96,16 +79,8 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }}
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -129,53 +104,5 @@ jobs:
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
!{{ config["build_name"] }}-upload: # Uploading
runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts
if: ${{ github.repository_owner == 'pytorch' }}
needs: !{{ config["build_name"] }}-build
!{{ binary_env(config) }}
steps:
!{{ common.setup_ec2_linux() }}
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- uses: actions/download-artifact@v2
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
fi
- name: Upload binaries
env:
PKG_DIR: "${{ runner.temp }}/artifacts"
UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}"
# When running these on pull_request events these should be blank
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }}
ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
run: |
docker run --rm -i \
-e ANACONDA_API_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e DRY_RUN \
-e PACKAGE_TYPE \
-e PKG_DIR=/artifacts \
-e UPLOAD_CHANNEL \
-e UPLOAD_SUBFOLDER \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-v "${GITHUB_WORKSPACE}:/v" \
-w /v \
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
bash -c '.circleci/scripts/binary_upload.sh'
!{{ common.teardown_ec2_linux() }}
!{{ upload.upload_binaries(config, has_test=False, use_s3=False) }}
{%- endfor %}

View File

@ -32,17 +32,25 @@
{%- endmacro %}
{%- macro upload_binaries(config, is_windows=False) -%}
{%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%}
!{{ config["build_name"] }}-upload: # Uploading
runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts
if: ${{ github.repository_owner == 'pytorch' }}
{%- if has_test %}
needs: !{{ config["build_name"] }}-test
{%- else %}
needs: !{{ config["build_name"] }}-build
{%- endif %}
!{{ binary_env(config, is_windows) }}
steps:
!{{ common.setup_ec2_linux() }}
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
{%- if use_s3 %}
- uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
{%- else %}
- uses: actions/download-artifact@v2
{%- endif %}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-arm64-binary-conda
@ -43,8 +44,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -69,16 +73,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -107,11 +122,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -165,11 +182,11 @@ jobs:
name: conda-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -222,8 +239,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -248,16 +268,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -286,11 +317,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -344,11 +377,11 @@ jobs:
name: conda-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -401,8 +434,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -427,16 +463,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -465,11 +512,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -523,11 +572,11 @@ jobs:
name: conda-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-arm64-binary-wheel
@ -43,8 +44,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -69,16 +73,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -107,11 +122,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_7-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
@ -165,11 +182,11 @@ jobs:
name: wheel-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -222,8 +239,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -248,16 +268,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -286,11 +317,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -344,11 +377,11 @@ jobs:
name: wheel-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -401,8 +434,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -427,16 +463,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -465,11 +512,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -523,11 +572,11 @@ jobs:
name: wheel-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -580,8 +629,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -606,16 +658,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -644,11 +707,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -702,11 +767,11 @@ jobs:
name: wheel-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-conda
@ -41,8 +42,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -67,16 +71,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -105,11 +120,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_7-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
@ -163,11 +180,11 @@ jobs:
name: conda-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -220,8 +237,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -246,16 +266,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -284,11 +315,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -342,11 +375,11 @@ jobs:
name: conda-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -399,8 +432,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -425,16 +461,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -463,11 +510,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -521,11 +570,11 @@ jobs:
name: conda-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -578,8 +627,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -604,16 +656,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -642,11 +705,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: conda-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: conda
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/conda-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -700,11 +765,11 @@ jobs:
name: conda-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-libtorch-cxx11-abi
@ -42,8 +43,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -72,16 +76,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -110,16 +125,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-with-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -172,11 +186,11 @@ jobs:
name: libtorch-cpu-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -230,8 +244,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -260,16 +277,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -298,16 +326,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-without-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -360,11 +387,11 @@ jobs:
name: libtorch-cpu-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -418,8 +445,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -448,16 +478,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -486,16 +527,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-with-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -548,11 +588,11 @@ jobs:
name: libtorch-cpu-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -606,8 +646,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -636,16 +679,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -674,16 +728,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-without-deps-cxx11-abi-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: cxx11-abi
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -736,11 +789,11 @@ jobs:
name: libtorch-cpu-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-libtorch-pre-cxx11
@ -42,8 +43,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -72,16 +76,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -110,16 +125,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-with-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -172,11 +186,11 @@ jobs:
name: libtorch-cpu-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -230,8 +244,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -260,16 +277,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -298,16 +326,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-shared-without-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: shared-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -360,11 +387,11 @@ jobs:
name: libtorch-cpu-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -418,8 +445,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -448,16 +478,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -486,16 +527,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-with-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-with-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -548,11 +588,11 @@ jobs:
name: libtorch-cpu-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -606,8 +646,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
@ -636,16 +679,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -674,16 +728,15 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: libtorch-cpu-static-without-deps-pre-cxx11-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: libtorch
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
LIBTORCH_VARIANT: static-without-deps
DESIRED_DEVTOOLSET: pre-cxx11
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
shell: bash
@ -736,11 +789,11 @@ jobs:
name: libtorch-cpu-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -1,4 +1,5 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/macos_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: macos-binary-wheel
@ -41,8 +42,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -67,16 +71,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -105,11 +120,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_7-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.7"
steps:
- name: Display EC2 information
@ -163,11 +180,11 @@ jobs:
name: wheel-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -220,8 +237,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -246,16 +266,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -284,11 +315,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_8-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
steps:
- name: Display EC2 information
@ -342,11 +375,11 @@ jobs:
name: wheel-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -399,8 +432,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -425,16 +461,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -463,11 +510,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_9-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
steps:
- name: Display EC2 information
@ -521,11 +570,11 @@ jobs:
name: wheel-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
@ -578,8 +627,11 @@ jobs:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@ -604,16 +656,27 @@ jobs:
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
submodules: recursive
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |
@ -642,11 +705,13 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: wheel-py3_10-cpu-build
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
BUILDER_ROOT: ${{ github.workspace }}/builder
PACKAGE_TYPE: wheel
SKIP_ALL_TESTS: 1
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
@ -700,11 +765,11 @@ jobs:
name: wheel-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}}
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
run: |
# reference ends with an RC suffix
if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then

View File

@ -157,4 +157,5 @@ python setup.py install --cmake && sccache --show-stats && (
sccache --show-stats > stats.txt
python -m tools.stats.upload_sccache_stats stats.txt
sccache --stop-server
rm stats.txt

View File

@ -0,0 +1,23 @@
#include <ATen/core/TensorBase.h>
// Broadcasting utilities for working with TensorBase
namespace at {
namespace internal {
TORCH_API TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size);
} // namespace internal
inline c10::MaybeOwned<TensorBase> expand_size(const TensorBase &self, IntArrayRef size) {
if (size.equals(self.sizes())) {
return c10::MaybeOwned<TensorBase>::borrowed(self);
}
return c10::MaybeOwned<TensorBase>::owned(
at::internal::expand_slow_path(self, size));
}
c10::MaybeOwned<TensorBase> expand_size(TensorBase &&self, IntArrayRef size) = delete;
inline c10::MaybeOwned<TensorBase> expand_inplace(const TensorBase &tensor, const TensorBase &to_expand) {
return expand_size(to_expand, tensor.sizes());
}
c10::MaybeOwned<TensorBase> expand_inplace(const TensorBase &tensor, TensorBase &&to_expand) = delete;
} // namespace at
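Aside, not from this patch: a minimal usage sketch of the expand_size helper above, assuming a built ATen where the header is reachable as <ATen/ExpandBase.h>.
// Illustration only: expand_size borrows when no expansion is needed and
// otherwise owns the result of expand_slow_path.
#include <ATen/ATen.h>
#include <ATen/ExpandBase.h>
void expand_size_example() {
  at::Tensor a = at::ones({2, 3});
  // Sizes already match: the MaybeOwned borrows `a`, nothing is allocated.
  c10::MaybeOwned<at::TensorBase> same = at::expand_size(a, {2, 3});
  // Sizes differ: falls back to expand_slow_path and owns a new TensorBase.
  c10::MaybeOwned<at::TensorBase> grown = at::expand_size(a, {4, 2, 3});
  TORCH_INTERNAL_ASSERT(grown->sizes() == at::IntArrayRef({4, 2, 3}));
  (void)same;
}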

View File

@ -1,8 +1,15 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/ExpandUtils.h>
#include <ATen/ExpandBase.h>
#include <c10/util/irange.h>
namespace at {
namespace internal {
TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) {
return OptionalTensorRef(self)->expand(size);
}
}
namespace {
// NOTE: are_expandable did a similar check; please keep them in sync if changes are needed

View File

@ -1,3 +1,3 @@
#pragma once
#include <ATen/core/TensorBody.h>
#include <ATen/core/Tensor.h>

View File

@ -1,10 +1,30 @@
#include <ATen/TensorGeometry.h>
#include <ATen/TensorUtils.h>
#include <ATen/ATen.h>
#include <limits>
#include <cstddef>
namespace at {
// See TensorGeometry.h on why this is useful now that we cache is_contiguous.
bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) {
assert(sizes.size() < static_cast<std::size_t>(std::numeric_limits<std::int64_t>::max()));
auto dim = static_cast<std::int64_t>(sizes.size());
int64_t expected_stride = 1;
bool contig_if_nonempty = true;
for (int64_t i = dim - 1; i >= 0; i--) {
if (sizes[i] == 0) {
return true;
}
if (contig_if_nonempty) {
if (sizes[i] != 1 && strides[i] != expected_stride) {
contig_if_nonempty = false;
}
expected_stride *= sizes[i];
}
}
return contig_if_nonempty;
}
bool TensorGeometry::is_contiguous() const {
if (numel_ == 0) {
return true;

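Aside, not from this patch: a small sketch of querying geometry_is_contiguous for a hypothetical layout without constructing a tensor.
// Illustration only.
#include <ATen/TensorGeometry.h>
#include <c10/util/Exception.h>
void geometry_example() {
  // Row-major 2x3 layout: strides {3, 1} -> contiguous.
  bool row_major = at::geometry_is_contiguous({2, 3}, {3, 1});
  // Transposed view (sizes {2, 3}, strides {1, 2}) -> not contiguous.
  bool transposed = at::geometry_is_contiguous({2, 3}, {1, 2});
  TORCH_INTERNAL_ASSERT(row_major && !transposed);
}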
View File

@ -1,10 +1,17 @@
#pragma once
#include <ATen/WrapDimUtils.h>
#include <ATen/core/Tensor.h>
#include <c10/core/WrapDimMinimal.h>
#include <ATen/core/TensorBase.h>
namespace at {
// Return whether the tensor geometry represented by `sizes` and `strides` is contiguous
// Although we cache is_contiguous in tensor now, this is still useful because it
// allows checking if a particular geometry is contiguous without explicitly
// constructing a tensor, e.g., when you want to choose a kernel strategy based
// on whether a subgeometry is contiguous.
TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
struct TORCH_API TensorGeometry {
TensorGeometry() : storage_offset_(0) {}
@ -21,7 +28,7 @@ struct TORCH_API TensorGeometry {
numel_ = expected_stride;
}
explicit TensorGeometry(const Tensor& t)
explicit TensorGeometry(const TensorBase& t)
: sizes_(t.sizes().vec())
, strides_(t.strides().vec())
, storage_offset_(t.storage_offset())
@ -32,12 +39,12 @@ struct TORCH_API TensorGeometry {
int64_t dim() const { return sizes_.size(); }
int64_t size(int64_t dim) const {
dim = maybe_wrap_dim(dim, this->dim());
dim = c10::maybe_wrap_dim(dim, this->dim());
return sizes_.at(static_cast<size_t>(dim));
}
IntArrayRef sizes() const { return IntArrayRef{ sizes_ }; }
int64_t stride(int64_t dim) const {
dim = maybe_wrap_dim(dim, this->dim());
dim = c10::maybe_wrap_dim(dim, this->dim());
return strides_.at(static_cast<size_t>(dim));
}
IntArrayRef strides() const { return IntArrayRef{ strides_ }; }

View File

@ -28,7 +28,8 @@ constexpr auto kFunctorchWrappedTensors = DispatchKeySet({
constexpr auto kTensorSubclassLike = kFunctorchWrappedTensors | DispatchKeySet({
DispatchKey::Batched,
DispatchKey::Sparse,
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
DispatchKey::Meta,

View File

@ -264,25 +264,6 @@ void * maybe_data_ptr(const TensorArg& tensor) {
return tensor->defined() ? (void *)tensor->data_ptr() : nullptr;
}
// See TensorUtils.h on why this is useful now that we cache is_contiguous.
bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) {
int64_t dim = sizes.size();
int64_t expected_stride = 1;
bool contig_if_nonempty = true;
for (int64_t i = dim - 1; i >= 0; i--) {
if (sizes[i] == 0) {
return true;
}
if (contig_if_nonempty) {
if (sizes[i] != 1 && strides[i] != expected_stride) {
contig_if_nonempty = false;
}
expected_stride *= sizes[i];
}
}
return contig_if_nonempty;
}
void check_dim_size(
const Tensor& tensor,
int64_t dim,

View File

@ -138,13 +138,6 @@ TORCH_API void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layo
TORCH_API void* maybe_data_ptr(const Tensor& tensor);
TORCH_API void* maybe_data_ptr(const TensorArg& tensor);
// Return if the tensor geometry represented by `sizes` and `strides` is contiguous
// Although we cache is_contiguous in tensor now, this is till useful because it
// allows checking if a particular geometry is contiguous without explicitly
// constructing a tensor, e.g., when you want to choose a kernel strategy based
// on whether a subgeometry is contiguous.
TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
TORCH_API void check_dim_size(
const Tensor& tensor,
int64_t dim,

View File

@ -91,29 +91,6 @@ std::array<int64_t, N> check_intlist(ArrayRef<int64_t> list, const char * name,
return res;
}
/**
* Utility function to static cast input Generator* to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T * check_generator(c10::optional<Generator> gen) {
TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt");
TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed");
TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'");
return gen->get<T>();
}
/**
* Utility function used in tensor implementations, which
* supplies the default generator to tensors, if an input generator
* is not supplied. The input Generator* is also static casted to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T* get_generator_or_default(const c10::optional<Generator>& gen, const Generator& default_gen) {
return gen.has_value() && gen->defined() ? check_generator<T>(gen) : check_generator<T>(default_gen);
}
using at::detail::check_size_nonnegative;
namespace detail {

View File

@ -138,6 +138,29 @@ Generator make_generator(Args&&... args) {
return Generator(c10::make_intrusive<Impl>(std::forward<Args>(args)...));
}
/**
* Utility function to static cast input Generator* to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T * check_generator(c10::optional<Generator> gen) {
TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt");
TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed");
TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'");
return gen->get<T>();
}
/**
* Utility function used in tensor implementations, which
* supplies the default generator to tensors, if an input generator
* is not supplied. The input Generator* is also static casted to
* the backend generator type (CPU/CUDAGeneratorImpl etc.)
*/
template <typename T>
static inline T* get_generator_or_default(const c10::optional<Generator>& gen, const Generator& default_gen) {
return gen.has_value() && gen->defined() ? check_generator<T>(gen) : check_generator<T>(default_gen);
}
namespace detail {
/**

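Aside, not from this patch: a sketch of how a kernel typically consumes these helpers, assuming the usual ATen CPU generator API (CPUGeneratorImpl, at::detail::getDefaultCPUGenerator).
// Illustration only: resolve an optional Generator argument inside a CPU kernel.
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <mutex>
uint64_t draw_one(c10::optional<at::Generator> gen) {
  auto* cpu_gen = at::get_generator_or_default<at::CPUGeneratorImpl>(
      gen, at::detail::getDefaultCPUGenerator());
  // Generator state is shared, so hold its mutex while sampling.
  std::lock_guard<std::mutex> lock(cpu_gen->mutex_);
  return cpu_gen->random();
}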
View File

@ -4,7 +4,14 @@
namespace {
// TLS saving the state of the include/exclude sets on entry to the dispatcher
// This is set in the pythonTLSSnapshot fallback and used by the Python fallback.
thread_local c10::optional<c10::impl::LocalDispatchKeySet> tls_on_entry;
void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
TORCH_INTERNAL_ASSERT(tls_on_entry.has_value());
c10::impl::ForceDispatchKeyGuard guard(tls_on_entry.value());
// If Python Mode is active, use its PyInterpreter for dispatch
const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state();
if (maybe_python_mode_state) {
@ -42,8 +49,25 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)");
}
void pythonTLSSnapshotFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
// It is ok for the tls to already be set here.
// A CompositeImplicitAutograd function may have been called just before this, so the tls here was never cleared.
// This is also why we don't need an RAII guard to ensure the tls is reset when exceptions happen.
tls_on_entry = c10::impl::tls_local_dispatch_key_set();
op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::PythonTLSSnapshot), stack);
tls_on_entry = c10::nullopt;
}
} // anonymous namespace
TORCH_LIBRARY_IMPL(_, Python, m) {
m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>());
}
TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) {
m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>());
}

View File

@ -8,6 +8,7 @@ void PythonModeTLS::set_state(const std::shared_ptr<TorchDispatchTypeObject>& st
pythonModeState = state;
if (state) {
c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, true);
} else {
PythonModeTLS::reset_state();
}
@ -20,6 +21,7 @@ const std::shared_ptr<TorchDispatchTypeObject>& PythonModeTLS::get_state() {
void PythonModeTLS::reset_state() {
pythonModeState.reset((TorchDispatchTypeObject*)nullptr);
c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, false);
}
} // namespace impl

View File

@ -4,6 +4,15 @@
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/MethodOperators.h>
#else
#include <ATen/ops/contiguous_ops.h>
#include <ATen/ops/fill_ops.h>
#include <ATen/ops/to_ops.h>
#include <ATen/ops/zero_ops.h>
#endif
#include <iostream>
namespace at {
@ -29,6 +38,18 @@ const TensorBase& TensorBase::zero_() const {
return *this;
}
TensorBase TensorBase::to(
at::TensorOptions options,
bool non_blocking,
bool copy,
c10::optional<at::MemoryFormat> memory_format) const {
Tensor self(*this);
return at::_ops::to_dtype_layout::call(
self, optTypeMetaToScalarType(options.dtype_opt()),
options.layout_opt(), options.device_opt(),
options.pinned_memory_opt(), non_blocking, copy, memory_format);
}
void TensorBase::enforce_invariants() {
if (impl_.get() == nullptr) {
throw std::runtime_error("TensorImpl with nullptr is not supported");

View File

@ -1,6 +1,7 @@
#pragma once
#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>

View File

@ -141,6 +141,8 @@ class TORCH_API TensorBase {
const TensorBase& fill_(const c10::Scalar& scalar) const;
const TensorBase& zero_() const;
TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) const;
bool is_complex() const {
return at::isComplexType(this->scalar_type());
}

View File

@ -6,52 +6,11 @@
namespace c10 {
void DispatchKeyExtractor::setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough) {
// (1) update nonFallthroughKeys_
if (has_fallthrough) {
nonFallthroughKeys_ = nonFallthroughKeys_.remove(k);
} else {
nonFallthroughKeys_ = nonFallthroughKeys_.add(k);
}
// (2) update nonFallthroughKeysPerBackend_
if (isPerBackendFunctionalityKey(toFunctionalityKey(k))) {
// This is a per-backend functionality key.
// We need to figure out what the current backend is,
// and only update the bitset for that backend.
// subtracting 1 because the first backend should have index 0 (CPU),
// But the enum starts with BackendComponent::InvalidBit.
auto backend_idx = static_cast<uint8_t>(toBackendComponent(k)) - 1;
TORCH_INTERNAL_ASSERT(backend_idx >= 0 && static_cast<uint8_t>(backend_idx) < nonFallthroughKeysPerBackend_.size());
if (has_fallthrough) {
nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].remove(k);
} else {
nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].add(k);
}
// Set requiresBitsetPerBackend_ accordingly
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size() - 1)) {
if (nonFallthroughKeysPerBackend_[i] != nonFallthroughKeysPerBackend_[i+1]) {
requiresBitsetPerBackend_ = true;
return;
}
}
requiresBitsetPerBackend_ = false;
return;
} else {
// Otherwise, if a fallthrough is set for a functionality that isn't per backend,
// Then we update the fallthrough bitset for EVERY backend.
// TODO: we could probably optimize this by only lazily updating these values
// the first time that we see requiresBitsetPerBackend_ = true
// (which should almost never happen)
if (has_fallthrough) {
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].remove(k);
}
} else {
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].add(k);
}
}
}
}
std::string DispatchKeyExtractor::dumpState() const {

View File

@ -156,25 +156,15 @@ public:
}
});
// Keys that are fallthrough should be skipped
if (requiresBitsetPerBackend_) {
auto backend_idx = ks.getBackendIndex();
return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]);
} else {
return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
}
}
template<class... Args>
DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const {
auto ks = detail::multi_dispatch_key_set(args...);
// Keys that are fallthrough should be skipped
if (requiresBitsetPerBackend_) {
auto backend_idx = ks.getBackendIndex();
return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]);
} else {
return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
}
}
void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough);
@ -203,12 +193,7 @@ private:
explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse)
: dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse)
, nonFallthroughKeys_(DispatchKeySet::FULL)
, requiresBitsetPerBackend_(false) {
for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL;
}
}
, nonFallthroughKeys_(DispatchKeySet::FULL) {}
// this is a bitset that has ones for each argument index which has to be
// considered for dispatch. This avoids having to iterate over the stack
@ -220,14 +205,8 @@ private:
// fallthrough
c10::utils::bitset dispatch_arg_indices_reverse_;
// Set of functionality keys for which the operator does NOT have fallthrough kernel.
// Set of keys for which the operator does NOT have fallthrough kernel.
DispatchKeySet nonFallthroughKeys_;
// Set of functionality keys for which the operator does NOT have fallthrough kernel, defined PER BACKEND.
// This is only needed if we know that the operator has a different set of fallthroughs defined for some backends.
std::array<DispatchKeySet, num_backends> nonFallthroughKeysPerBackend_;
// Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast path),
// or if we need to fall back to the slower path and check nonFallthroughKeysPerBackend_
bool requiresBitsetPerBackend_;
};
}

View File

@ -267,15 +267,14 @@ void Dispatcher::cleanup(const OperatorHandle& op, const OperatorName& op_name)
RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, KernelFunction kernel, std::string debug) {
std::lock_guard<std::mutex> lock(mutex_);
auto idx = getDispatchTableIndexForDispatchKey(dispatchKey);
TORCH_CHECK(
!backendFallbackKernels_[idx].kernel.isValid(),
!backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)].kernel.isValid(),
"Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ",
backendFallbackKernels_[idx].debug, ", new registration ", debug
backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)].debug, ", new registration ", debug
);
// NB: inferred function schema is always nullptr for fallbacks, as fallbacks
// cannot be unboxed
backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug));
backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug));
for (auto& op : operators_) {
op.op.updateFallback(*this, dispatchKey);
@ -289,8 +288,7 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker
void Dispatcher::deregisterFallback_(DispatchKey dispatchKey) {
std::lock_guard<std::mutex> lock(mutex_);
auto idx = getDispatchTableIndexForDispatchKey(dispatchKey);
backendFallbackKernels_[idx] = {};
backendFallbackKernels_[static_cast<uint8_t>(dispatchKey)] = {};
for (auto& op : operators_) {
op.op.updateFallback(*this, dispatchKey);

View File

@ -291,7 +291,7 @@ private:
// Map from namespace to debug string (saying, e.g., where the library was defined)
ska::flat_hash_map<std::string, std::string> libraries_;
std::array<impl::AnnotatedKernel, num_runtime_entries> backendFallbackKernels_;
std::array<impl::AnnotatedKernel, static_cast<uint8_t>(DispatchKey::NumDispatchKeys)> backendFallbackKernels_;
std::unique_ptr<detail::RegistrationListenerList> listeners_;
std::mutex mutex_;
@ -531,7 +531,8 @@ C10_DISPATCHER_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorH
detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5
auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor()
.template getDispatchKeySetUnboxed<Args...>(args...);
const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKeySet.highestPriorityTypeId()));
const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet.highestPriorityTypeId());
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
// By default, when there're no high-frequency or non-sampled callbacks,
// RecordFunction is pre-sampled as a perf optimization;
@ -552,7 +553,7 @@ template<class Return, class... Args>
inline Return Dispatcher::redispatch(const TypedOperatorHandle<Return (Args...)>& op, DispatchKeySet currentDispatchKeySet, Args... args) const {
detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5
// do not use RecordFunction on redispatch
const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet);
const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet.highestPriorityTypeId());
return kernel.template call<Return, Args...>(op, currentDispatchKeySet, std::forward<Args>(args)...);
}
@ -560,7 +561,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const
// note: this doesn't need the mutex because write operations on the list keep iterators intact.
const auto& entry = op.operatorDef_->op;
auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
const auto& kernel = entry.lookup(dispatchKeySet);
const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId());
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
bool pre_sampled = false;
if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
@ -592,7 +593,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const
inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const {
// note: this doesn't need the mutex because write operations on the list keep iterators intact.
const auto& entry = op.operatorDef_->op;
const auto& kernel = entry.lookup(dispatchKeySet);
const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId());
return kernel.callBoxed(op, dispatchKeySet, stack);
}

View File

@ -283,7 +283,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
}
// 3. Backend fallback
auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key);
auto dispatch_ix = static_cast<uint8_t>(dispatch_key);
if (dispatcher.backendFallbackKernels_[dispatch_ix].kernel.isValid()) {
return {dispatcher.backendFallbackKernels_[dispatch_ix], "backend fallback"};
}
@ -299,7 +299,10 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
// or alias keys and their associated keysets).
// This function should be considered a private helper for updateDispatchTable_()
void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) {
const auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key);
const auto dispatch_ix = c10::getDispatchTableIndexForDispatchKey(dispatch_key);
if (C10_UNLIKELY(dispatch_ix == -1)) {
return;
}
dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, dispatch_key);
dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_key, dispatchTable_[dispatch_ix].isFallthrough());
}
@ -326,12 +329,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp
}
// Note [Refresh Runtime Autograd entries in dispatchTable_]
// Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3).
// In theory, we should only have to check if the given runtime key has "dense" functionality,
// e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit).
// However, there are some backends that should be included in this set that don't have the dense key set.
// E.g. DispatchKey::Meta, DispatchKey::ORT.
if (c10::isBackendDispatchKey(dispatch_key)) {
DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key));
DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key);
updateDispatchTableEntry_(dispatcher, autograd_key);
}
}
@ -358,9 +357,8 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher)
// catchAll. After catchAllKernel_ is removed, Undefined can now get a kernel from either the CompositeExplicitAutograd
// or the CompositeImplicitAutograd alias key so that we don't break support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd)
// should return true, but it returns false because Undefined cannot be represented in a DispatchKeySet.
updateDispatchTable_(dispatcher, DispatchKey::Undefined);
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) {
updateDispatchTable_(dispatcher, k);
for (uint8_t iter = 0; iter != static_cast<uint8_t>(DispatchKey::NumDispatchKeys); ++iter) {
updateDispatchTable_(dispatcher, static_cast<DispatchKey>(iter));
}
}
@ -373,10 +371,9 @@ void OperatorEntry::checkInvariants() const {
for (const auto& kv : kernels_) {
TORCH_INTERNAL_ASSERT(kv.second.size() > 0, dumpState());
}
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) {
auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k);
auto idx = getDispatchTableIndexForDispatchKey(k);
TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[idx]),
for (uint8_t iter = 0; iter != static_cast<uint8_t>(DispatchKey::NumDispatchKeys); ++iter) {
auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), static_cast<DispatchKey>(iter));
TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[iter]),
"Canonical state\n~~~~~~~~~~~\n", dumpState(), "\n\n"
"Computed table:\n~~~~~~~~~~~\n", dumpComputedTable());
}
@ -387,8 +384,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
str << "[";
bool has_kernels = false;
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) {
auto iter = getDispatchTableIndexForDispatchKey(k);
for (uint8_t iter = 0; iter != static_cast<uint8_t>(DispatchKey::NumDispatchKeys); ++iter) {
if (!dispatchTable_[iter].isValid()) {
continue;
}
@ -447,12 +443,8 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const {
// updateDispatchTableFull_ would update the dispatch table to be)
std::string OperatorEntry::dumpComputedTable() const {
std::ostringstream oss;
// Need to handle Undefined separately, because its a runtime key that can't be represented
// in a DispatchKeySet.
std::vector<DispatchKey> runtime_keys = {DispatchKey::Undefined};
for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k);
for (auto k : runtime_keys) {
for (uint8_t i = 0; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys); i++) {
auto k = static_cast<DispatchKey>(i);
auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k);
if (kernel_prov.first.kernel.isValid()) {
oss << toString(k) << ": "

View File

@ -173,8 +173,11 @@ public:
[[noreturn]] void reportError(DispatchKey dispatchKey) const;
const KernelFunction& lookup(DispatchKeySet ks) const {
const auto idx = ks.getDispatchTableIndexForDispatchKeySet();
const KernelFunction& lookup(DispatchKey k) const {
const auto idx = getDispatchTableIndexForDispatchKey(k);
if (C10_UNLIKELY(idx == -1)) {
reportError(k);
}
const auto& kernel = dispatchTable_[idx];
// A valid kernel *always* has a boxed kernel and *may* have an
// unboxed kernel. However, we typically do unboxed calls in at::
@ -184,7 +187,7 @@ public:
// in the common case.
if (C10_UNLIKELY(!kernel.isValidUnboxed())) {
if (!kernel.isValid()) {
reportError(ks.highestPriorityTypeId());
reportError(k);
}
}
return kernel;
@ -208,7 +211,7 @@ private:
OperatorName name_;
c10::optional<AnnotatedSchema> schema_;
std::array<KernelFunction, c10::num_runtime_entries> dispatchTable_;
std::array<KernelFunction, c10::getDispatchTableIndexForDispatchKey(DispatchKey::NumDispatchKeys)> dispatchTable_;
DispatchKeyExtractor dispatchKeyExtractor_;
// kernels_ stores all registered kernels for the corresponding dispatch key

View File

@ -45,6 +45,10 @@ namespace c10 {
_(prim, CudaFusionGuard) \
_(prim, FunctionalGraph) \
_(prim, add_optional) \
_(prim, view_copy) \
_(prim, reshape_copy) \
_(prim, squeeze_copy) \
_(prim, unsqueeze_copy) \
_(prim, DifferentiableGraph) \
_(prim, TensorExprGroup) \
_(prim, TensorExprDynamicGroup) \

View File

@ -591,7 +591,7 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) {
void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(c10::getAutogradKeyFromBackend(toBackendComponent(key)))
.kernel<decltype(nonautograd_kernel), &nonautograd_kernel>(c10::getAutogradKeyFromBackend(key))
.kernel<decltype(autograd_kernel), &autograd_kernel>(DispatchKey::Autograd));
auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
@ -1791,22 +1791,22 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) {
TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
bool fpga_called, math_called = false;
bool sparsecpu_called, math_called = false;
auto m = MAKE_TORCH_LIBRARY(test);
m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; }));
m.def("fn", torch::dispatch(c10::DispatchKey::SparseCPU, [&](const Tensor& x) { sparsecpu_called = true; return x; }));
m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; });
auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
ASSERT_TRUE(op.has_value());
{
callOp(*op, dummyTensor(c10::DispatchKey::FPGA));
ASSERT_TRUE(fpga_called);
callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU));
ASSERT_TRUE(sparsecpu_called);
}
{
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(c10::DispatchKey::FPGA, /*requires_grad=*/true));
callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU, /*requires_grad=*/true));
}, "test::fn has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther.");
}
}
@ -1849,15 +1849,18 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) {
}
{
// TODO(#43908): currently this will fall through AutogradPrivateUse1 and then call the catchall kernel
// at AutogradCPU, while backend extenders actually expect the PrivateUse1 kernel to be called.
// This confusing behavior is caused by us registering fallthrough as the backend fallback for
// Autograd keys. Note that users can always work around this by registering the same kernel to
// AutogradPrivateUse1, as shown below, until we support it.
auto op = Dispatcher::singleton().findOp({"test::fn", ""});
ASSERT_TRUE(op.has_value());
catchall_called = false;
privateuse1_called = false;
callOp(*op,
dummyTensor(c10::DispatchKey::PrivateUse1, /*requires_grad=*/true),
dummyTensor(c10::DispatchKey::CPU, /*requires_grad=*/true));
ASSERT_FALSE(catchall_called);
ASSERT_TRUE(privateuse1_called);
ASSERT_TRUE(catchall_called);
}
m.impl("fn", c10::DispatchKey::AutogradPrivateUse1, [&](const Tensor& x, const Tensor& y) { privateuse1_called = true; return x; });
@ -1873,27 +1876,6 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) {
}
}
TEST(NewOperatorRegistrationTest, registerCompositeImplicitAutogradWithCPUKernel_andCallAutogradOtherKernel_callsComposite) {
bool math_called = false;
bool cpu_called = false;
auto m = MAKE_TORCH_LIBRARY(test);
m.def("fn(Tensor dummy) -> Tensor");
m.impl("fn", c10::DispatchKey::CPU, [&](const Tensor& x) { cpu_called = true; return x; });
m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; });
auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
ASSERT_TRUE(op.has_value());
{
math_called = cpu_called = false;
// Meta should redispatch to the AutogradOther backend,
// which the composite kernel should be registered to.
callOp(*op, dummyTensor(c10::DispatchKey::Meta, /*requires_grad=*/true));
ASSERT_TRUE(math_called);
ASSERT_FALSE(cpu_called);
}
}
TEST(NewOperatorRegistrationTest, dispatchMultiple) {
bool cpu_called = false;
bool cuda_called = false;

View File

@ -2,7 +2,7 @@
#include <ATen/cuda/ApplyGridUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/TensorUtils.h>
#include <ATen/core/TensorBase.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAContext.h>
@ -378,12 +378,14 @@ kernelPointwiseApply2(detail::TensorInfo<scalar1, IndexType> a,
template <typename scalar1, typename scalar2, int step, typename Op,
int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(at::Tensor a,
at::Tensor b,
inline bool CUDA_tensor_apply2(at::TensorBase a,
at::TensorBase b,
const Op op,
TensorArgType aType = TensorArgType::ReadWrite,
TensorArgType bType = TensorArgType::ReadOnly) {
checkDeviceType("CUDA_tensor_apply2", {a, b}, DeviceType::CUDA);
TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(),
"CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got "
"tensors with type ", a.device().type(), " and ", b.device().type());
int64_t totalElements = a.numel();
if (totalElements != b.numel()) {
@ -413,8 +415,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
This ensures that each element of the tensor is operated on once and only
once.
*/
Tensor oldA;
Tensor oldB;
TensorBase oldA;
TensorBase oldB;
if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
// Must perform in contiguous space
@ -524,8 +526,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
template <typename scalar1, typename scalar2, typename Op,
int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(at::Tensor a,
at::Tensor b,
inline bool CUDA_tensor_apply2(const at::TensorBase &a,
const at::TensorBase &b,
const Op op,
TensorArgType aType = TensorArgType::ReadWrite,
TensorArgType bType = TensorArgType::ReadOnly) {

View File

@ -1,9 +1,7 @@
#pragma once
#include <c10/core/GeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <ATen/cuda/detail/PhiloxCudaStateRaw.cuh>
#include <ATen/Tensor.h>
#include <ATen/Context.h>
#include <limits>

View File

@ -258,7 +258,7 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
return self;
}
void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) {
void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) {
// Called when we are copying into an overlapping index `dst`, but we don't
// care which writer wins. Hacky but it works. This is only used by
// CUDA_tensor_apply2 in case there are write overlaps.

View File

@ -6,6 +6,7 @@ namespace at {
class Tensor;
struct TensorIterator;
class TensorBase;
namespace native {
@ -13,7 +14,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking);
DECLARE_DISPATCH(copy_fn, copy_stub);
TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src);
TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src);
} // namespace native
} // namespace at

View File

@ -1,7 +1,5 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
#include <ATen/native/Math.h>
#include <c10/macros/Macros.h>
#include <c10/util/MathConstants.h>

View File

@ -864,8 +864,13 @@ Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid,
}
}
return grid_sampler_2d_cpu_kernel(
kCPU, input, grid, interpolation_mode, padding_mode, align_corners);
auto in_size = input.sizes();
auto grid_size = grid.sizes();
auto output = at::empty(
{in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options());
grid_sampler_2d_cpu_kernel(
kCPU, output, input, grid, interpolation_mode, padding_mode, align_corners);
return output;
}
DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel);
@ -911,8 +916,15 @@ grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, con
}
}
return grid_sampler_2d_backward_cpu_kernel(
kCPU, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask);
Tensor grad_input;
if (output_mask[0]) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
grid_sampler_2d_backward_cpu_kernel(
kCPU, grad_input, grad_grid, grad_output, input, grid,
interpolation_mode, padding_mode, align_corners, output_mask);
return std::make_tuple(std::move(grad_input), std::move(grad_grid));
}
DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel);

View File

@ -1,7 +1,9 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
namespace at { namespace native {

View File

@ -1627,8 +1627,7 @@ Tensor matmul(
Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2;
auto size1 = tensor1.sizes();
auto size2 = t2.sizes();
std::vector<int64_t> output_size;
output_size.insert(output_size.end(), size1.begin(), size1.end() - 1);
DimVector output_size(size1.begin(), size1.end() - 1);
if (dim_tensor2 > 1) {
output_size.push_back(size2[dim_tensor2 - 1]);
}
@ -1660,7 +1659,8 @@ Tensor matmul(
return has_out ? out.set_(res) : res;
}
else {
std::vector<int64_t> shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec();
c10::IntArrayRef shape_array = tensor2.sizes().slice(0, dim_tensor2 - 2);
DimVector shape(shape_array.begin(), shape_array.end());
shape.push_back(p);
Tensor res = res_T.reshape(shape).contiguous();
@ -1677,29 +1677,29 @@ Tensor matmul(
IntArrayRef batch_tensor2(tensor2.sizes().data(), std::max<int64_t>(dim_tensor2 - 2, 0));
// expand the batch portion (i.e. cut off matrix dimensions and expand rest)
std::vector<int64_t> expand_batch_portion = infer_size(batch_tensor1, batch_tensor2);
DimVector expand_batch_portion = infer_size_dimvector(batch_tensor1, batch_tensor2);
std::vector<int64_t> tensor1_expand_size(expand_batch_portion);
tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1});
DimVector tensor1_expand_size(expand_batch_portion);
tensor1_expand_size.push_back(n);
tensor1_expand_size.push_back(m1);
std::vector<int64_t> tensor2_expand_size(expand_batch_portion);
tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p});
DimVector tensor2_expand_size(expand_batch_portion);
tensor2_expand_size.push_back(m2);
tensor2_expand_size.push_back(p);
const int64_t expand_batch_product =
c10::multiply_integers(expand_batch_portion);
std::vector<int64_t> tensor1_bmm_view({expand_batch_product});
tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1});
std::array<int64_t, 3> tensor1_bmm_view = {expand_batch_product, n, m1};
std::vector<int64_t> tensor2_bmm_view({expand_batch_product});
tensor2_bmm_view.insert(tensor2_bmm_view.end(), {m2, p});
std::array<int64_t, 3> tensor2_bmm_view = {expand_batch_product, m2, p};
// flatten expanded batches
Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size).reshape(tensor1_bmm_view);
Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size).reshape(tensor2_bmm_view);
// reshape batches back into result
std::vector<int64_t> output_shape(expand_batch_portion);
DimVector output_shape(expand_batch_portion);
if (dim_tensor1 > 1) {
output_shape.push_back(n);
}
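Annotation, not part of the commit: the matmul hunks above swap std::vector<int64_t> scratch shapes for DimVector and replace the two-step insert() calls with direct construction or push_back. DimVector is a small-buffer vector of int64_t (a c10::SmallVector), so shapes of typical rank never touch the heap. A minimal sketch of the pattern with an illustrative function name:

// Sketch: build an output shape with DimVector instead of std::vector.
#include <ATen/ATen.h>
#include <ATen/DimVector.h>

at::Tensor shape_example(const at::Tensor& t1, const at::Tensor& t2) {
  auto size1 = t1.sizes();
  // Keep every dim of t1 except the last, then append the last dim of t2,
  // mirroring the output_size bookkeeping in matmul above.
  at::DimVector output_size(size1.begin(), size1.end() - 1);
  output_size.push_back(t2.size(-1));
  return at::empty(output_size, t1.options());
}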

View File

@ -45,6 +45,19 @@ namespace native {
DEFINE_DISPATCH(sort_stub);
DEFINE_DISPATCH(topk_stub);
void _fill_indices(const TensorBase &indices, int64_t dim) {
auto ndim = indices.dim();
assert(0 <= dim && dim < ndim);
auto dim_size = indices.size(dim);
auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong));
auto idx_dim_sizes = std::vector<int64_t>(ndim, 1);
auto idx_dim_strides = std::vector<int64_t>(ndim, 0);
idx_dim_sizes[dim] = dim_size;
idx_dim_strides[dim] = 1;
auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
OptionalTensorRef(indices)->copy_(idx_dim_restrided);
}
namespace {
/* Note from TH:

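Annotation, not part of the commit: _fill_indices above fills the indices tensor with 0..dim_size-1 along dim without materializing a full-sized index tensor; it restrides a 1-D arange so every dimension other than dim has stride 0, and the copy broadcasts it. A standalone sketch of the same trick (the 3x4 shape is only an example):

// Sketch: fill a [3, 4] long tensor with 0..3 along dim = 1 via as_strided.
#include <ATen/ATen.h>
#include <vector>

at::Tensor fill_indices_example() {
  const int64_t dim = 1;
  auto indices = at::empty({3, 4}, at::kLong);
  auto dim_size = indices.size(dim);
  auto idx_dim = at::arange(0, dim_size, indices.options());
  // Zero strides replicate the 1-D arange across every other dimension.
  std::vector<int64_t> sizes(indices.dim(), 1), strides(indices.dim(), 0);
  sizes[dim] = dim_size;
  strides[dim] = 1;
  indices.copy_(idx_dim.as_strided(sizes, strides));
  return indices;  // every row reads [0, 1, 2, 3]
}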
View File

@ -1,8 +1,11 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/native/DispatchStub.h>
namespace at {
class TensorBase;
}
namespace at {
namespace native {
@ -14,11 +17,13 @@ enum class QUANTILE_INTERPOLATION_MODE : uint8_t {
NEAREST
};
using sort_fn = void(*)(Tensor& values, Tensor& indices, int64_t dim, bool descending, bool stable);
using topk_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int64_t, int64_t, bool, bool);
using sort_fn = void(*)(const TensorBase &values, const TensorBase &indices, int64_t dim, bool descending, bool stable);
using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool);
DECLARE_DISPATCH(sort_fn, sort_stub);
DECLARE_DISPATCH(topk_fn, topk_stub);
void _fill_indices(const TensorBase &indices, int64_t dim);
} // namespace native
} // namespace at

View File

@ -86,92 +86,5 @@ inline void _allocate_or_resize_output_with_indices(
}
}
#ifdef CPU_CAPABILITY
inline namespace CPU_CAPABILITY {
#else
inline namespace DEFAULT {
#endif
// Core topk loop, shared between CPU and QuantizedCPU
template <typename scalar_t, typename accscalar_t>
void topk_impl_loop(
const int64_t mode_values_stride,
const int64_t mode_indices_stride,
const int64_t tmp_values_stride,
const int64_t k,
const int64_t dim_size,
const bool largest,
const bool sorted,
char** data, const int64_t* strides, const int64_t n) {
using elem_t = std::pair<accscalar_t, int64_t>;
std::vector<elem_t> queue(dim_size);
for (const auto i : c10::irange(n)) {
TensorAccessor<scalar_t, 1> mode_values(
reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
&k, &mode_values_stride);
TensorAccessor<int64_t, 1> mode_indices(
reinterpret_cast<int64_t*>(data[1] + i * strides[1]),
&k, &mode_indices_stride);
TensorAccessor<scalar_t, 1> tmp_values(
reinterpret_cast<scalar_t*>(data[2] + i * strides[2]),
&dim_size, &tmp_values_stride);
auto n = dim_size;
auto use_partial_sort = k * 64 <= n;
for (const auto j : c10::irange(n)) {
queue[j].first = tmp_values[j];
queue[j].second = j;
}
// we want nan to be sorted as top for numpy compatibility
if (use_partial_sort) {
if (largest) {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
} else {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
} else {
if (largest) {
std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k - 1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
}
} else {
std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k -1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
}
}
for (const auto j : c10::irange(k)) {
mode_values[j] = queue[j].first;
mode_indices[j] = queue[j].second;
}
}
}
} // namespace CPU_CAPABILITY
} // namespace native
} // namespace at

View File

@ -4,6 +4,7 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/NativeFunctions.h>
#include <ATen/WrapDimUtils.h>
#include <c10/util/irange.h>
#include <algorithm>

View File

@ -0,0 +1,95 @@
#pragma once
#include <ATen/core/TensorAccessor.h>
#include <ATen/NumericUtils.h>
namespace at {
namespace native {
#ifdef CPU_CAPABILITY
inline namespace CPU_CAPABILITY {
#else
inline namespace DEFAULT {
#endif
// Core topk loop, shared between CPU and QuantizedCPU
template <typename scalar_t, typename accscalar_t>
void topk_impl_loop(
const int64_t mode_values_stride,
const int64_t mode_indices_stride,
const int64_t tmp_values_stride,
const int64_t k,
const int64_t dim_size,
const bool largest,
const bool sorted,
char** data, const int64_t* strides, const int64_t n) {
using elem_t = std::pair<accscalar_t, int64_t>;
std::vector<elem_t> queue(dim_size);
for (const auto i : c10::irange(n)) {
TensorAccessor<scalar_t, 1> mode_values(
reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
&k, &mode_values_stride);
TensorAccessor<int64_t, 1> mode_indices(
reinterpret_cast<int64_t*>(data[1] + i * strides[1]),
&k, &mode_indices_stride);
TensorAccessor<scalar_t, 1> tmp_values(
reinterpret_cast<scalar_t*>(data[2] + i * strides[2]),
&dim_size, &tmp_values_stride);
auto n = dim_size;
auto use_partial_sort = k * 64 <= n;
for (const auto j : c10::irange(n)) {
queue[j].first = tmp_values[j];
queue[j].second = j;
}
// we want nan to be sorted as top for numpy compatibility
if (use_partial_sort) {
if (largest) {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
} else {
std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
} else {
if (largest) {
std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k - 1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((_isnan<accscalar_t>(x.first) && !_isnan<accscalar_t>(y.first)) || (x.first > y.first));
});
}
} else {
std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(),
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
if (sorted) {
std::sort(queue.begin(), queue.begin() + k -1,
[](const elem_t& x, const elem_t& y) -> bool {
return ((!_isnan<accscalar_t>(x.first) && _isnan<accscalar_t>(y.first)) || (x.first < y.first));
});
}
}
}
for (const auto j : c10::irange(k)) {
mode_values[j] = queue[j].first;
mode_indices[j] = queue[j].second;
}
}
}
} // namespace CPU_CAPABILITY
} // namespace native
} // namespace at
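Annotation, not part of the new header: topk_impl_loop chooses std::partial_sort when k is small relative to the slice (k * 64 <= n) and std::nth_element plus an optional std::sort otherwise, with comparators that rank NaN ahead of every non-NaN value so NaN surfaces as the largest element, as the NumPy-compatibility comment says. A self-contained sketch of that selection logic on a plain vector; the function name and the sorted-flag handling are illustrative:

// Sketch: k largest (value, index) pairs, NaN treated as the largest value.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<float, int64_t>> topk_largest(
    const std::vector<float>& values, int64_t k, bool sorted) {
  using elem_t = std::pair<float, int64_t>;
  const int64_t n = static_cast<int64_t>(values.size());
  std::vector<elem_t> queue(n);
  for (int64_t j = 0; j < n; ++j) {
    queue[j] = {values[j], j};
  }
  auto greater = [](const elem_t& x, const elem_t& y) {
    // NaN sorts in front of every number, matching the kernel's comparators.
    return (std::isnan(x.first) && !std::isnan(y.first)) || x.first > y.first;
  };
  if (k * 64 <= n) {
    // Small k: partial_sort finds and orders the top k in one pass.
    std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), greater);
  } else {
    // Large k: nth_element partitions around position k - 1, then sort only
    // the prefix when the caller asked for sorted output.
    std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), greater);
    if (sorted) {
      std::sort(queue.begin(), queue.begin() + k - 1, greater);
    }
  }
  queue.resize(k);
  return queue;
}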

View File

@ -6,7 +6,7 @@
namespace at {
class Tensor;
struct TensorIterator;
class TensorBase;
struct TensorIteratorBase;
}
@ -73,14 +73,14 @@ DECLARE_DISPATCH(unary_fn, trunc_stub);
DECLARE_DISPATCH(unary_fn, lgamma_stub);
// NB: these are actually defined in Distribution
DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, c10::optional<Generator>), bernoulli_tensor_stub);
DECLARE_DISPATCH(void(*)(Tensor&, const double, c10::optional<Generator>), bernoulli_scalar_stub);
DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional<Generator>), bernoulli_tensor_stub);
DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional<Generator>), bernoulli_scalar_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), cauchy_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), exponential_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), geometric_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), log_normal_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), uniform_stub);
DECLARE_DISPATCH(void(*)(Tensor&, const double, const double, c10::optional<Generator>), normal_stub);
DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional<Generator>), normal_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional<Generator>), random_from_to_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_full_64_bits_range_stub);
DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_stub);

View File

@ -14,10 +14,7 @@ namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto b_ = b.transpose(1, 0);
auto c_ = at::native::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), b.size(0)});
return at::native::matmul(a, b.t());
}
// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias
@ -45,7 +42,7 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
const scalar_t sqrt_dim_per_head = std::sqrt(static_cast<scalar_t>(dim_per_head));
int64_t grain_size =
std::min(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1);
std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1);
parallel_for(
0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) {
for (auto i : c10::irange(begin, end)) {
@ -56,8 +53,8 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
auto b = i;
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
// TODO: handle epilogue
for (auto dh = 0; dh < dim_per_head / V; dh += V) {
auto dh = 0;
for (; dh < dim_per_head; dh += V) {
auto d = nh * dim_per_head + dh;
// load
auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]);
@ -79,19 +76,43 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
q_data.store(&q_k_v_data
[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
k_data.store(&q_k_v_data
[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
v_data.store(&q_k_v_data
[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
}
if (dh != dim_per_head) {
for (dh = std::max(0, dh - V); dh < dim_per_head; dh++) {
auto d = nh * dim_per_head + dh;
auto q_bias = qkv_bias_data[d + 0 * D];
auto k_bias = qkv_bias_data[d + 1 * D];
auto v_bias = qkv_bias_data[d + 2 * D];
auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias;
auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias;
auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias;
q_data = q_data / sqrt_dim_per_head;
q_k_v_data[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = q_data;
q_k_v_data[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = k_data;
q_k_v_data[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = v_data;
}
}
}
});
});
@ -110,13 +131,16 @@ Tensor bmm_nt(const Tensor& a, const Tensor& b) {
}
void masked_softmax_dropout(
const Tensor& attn_scores,
Tensor& attn_scores,
const c10::optional<Tensor>& attn_mask) {
auto B = attn_scores.size(0);
auto num_heads = attn_scores.size(1);
auto T = attn_scores.size(2);
if (attn_mask) {
TORCH_CHECK(attn_mask->is_contiguous());
} else {
at::_softmax_out(attn_scores, attn_scores, 3, false);
return;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
@ -134,9 +158,10 @@ void masked_softmax_dropout(
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
scalar_t* input_data = attn_scores_data + i * T;
scalar_t* input_data = attn_scores_data + i;
auto max_input = Vec(std::numeric_limits<scalar_t>::lowest());
// TODO: handle epilogue
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
max_input = vec::maximum(max_input, v);
@ -147,6 +172,7 @@ void masked_softmax_dropout(
hmax = std::max(max_input[i], hmax);
}
accscalar_t hsum = 0;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
// TODO: vectorize in accscalar_t?
@ -155,6 +181,7 @@ void masked_softmax_dropout(
}
}
auto inv_denominator = 1.0 / hsum;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
Vec v = Vec::loadu(&input_data[t]);
@ -185,6 +212,8 @@ Tensor bmm_nn(const Tensor& a, const Tensor& b) {
Tensor transform_0213(const Tensor& a) {
// TODO: check perf vs dedicated kernel.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3));
return a.permute({0, 2, 1, 3})
.contiguous()
.view({a.size(0), a.size(2), a.size(1) * a.size(3)});
@ -196,6 +225,13 @@ Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
Tensor multi_head_self_attention_cpu(
@ -209,30 +245,63 @@ Tensor multi_head_self_attention_cpu(
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
auto q = std::get<0>(q_k_v);
auto k = std::get<1>(q_k_v);
auto v = std::get<2>(q_k_v);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}
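Annotation, not part of the commit: the transform_bias_rescale_qkv hunks above replace the "TODO: handle epilogue" with a scalar tail that finishes the elements the Vectorized loop does not cover, while the masked_softmax_dropout path still guards with TORCH_CHECK(T % V == 0). Below is a generic sketch of the usual vector-main-loop-plus-scalar-epilogue pattern, not a transcription of the kernel's exact loop bounds:

// Sketch: scale a buffer with full vector lanes, then a scalar epilogue.
#include <ATen/cpu/vec/vec.h>
#include <cstdint>

void scale_inplace(float* data, int64_t n, float scale) {
  using Vec = at::vec::Vectorized<float>;
  const int64_t V = Vec::size();
  const Vec vscale(scale);
  int64_t d = 0;
  // Main loop: only whole vector widths.
  for (; d + V <= n; d += V) {
    (Vec::loadu(data + d) * vscale).store(data + d);
  }
  // Epilogue: the remaining n % V elements, handled one at a time.
  for (; d < n; ++d) {
    data[d] *= scale;
  }
}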

View File

@ -1,9 +1,9 @@
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/Dispatch.h>
#include <ATen/Functions.h>
#include <ATen/Generator.h>
#include <ATen/core/DistributionsHelper.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/cpu/DistributionTemplates.h>
#include <ATen/native/UnaryOps.h>
@ -25,22 +25,22 @@ static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma,
templates::cpu::cauchy_kernel(iter, median, sigma, generator);
}
void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
templates::cpu::bernoulli_kernel(self, p_, generator);
}
void bernoulli_scalar_kernel_default(Tensor& self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel_default(const TensorBase &self, double p, c10::optional<Generator> gen) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
templates::cpu::bernoulli_kernel(self, p, generator);
}
#if !AT_MKL_ENABLED()
void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
bernoulli_scalar_kernel_default(self, p, gen);
}
#else
void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
int64_t seed;
@ -87,7 +87,7 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional<Generator> ge
// copy_ if using buffer and non contiguous
if (!contig) {
self.copy_(tmp_int_tensor);
OptionalTensorRef(self)->copy_(tmp_int_tensor);
}
});
} else {
@ -117,7 +117,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optio
templates::cpu::uniform_kernel(iter, from, to, generator);
}
void normal_kernel(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
void normal_kernel(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
templates::cpu::normal_kernel(self, mean, std, generator);
}

View File

@ -1,7 +1,8 @@
#pragma once
#include <ATen/Dispatch.h>
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandBase.h>
#include <ATen/core/DistributionsHelper.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
@ -105,7 +106,7 @@ static void normal_fill_16_AVX2(float *data,
}
template<typename RNG>
void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG generator) {
void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) {
float *data = self.data_ptr<float>();
auto size = self.numel();
std::lock_guard<std::mutex> lock(generator->mutex_);
@ -148,7 +149,7 @@ static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t s
}
template <typename scalar_t, typename RNG>
void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG generator) {
void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) {
scalar_t *data = self.data_ptr<scalar_t>();
auto size = self.numel();
std::lock_guard<std::mutex> lock(generator->mutex_);
@ -172,7 +173,7 @@ void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG gene
}
template<typename RNG>
void normal_kernel(Tensor& self, double mean, double std, RNG generator) {
void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) {
auto size = self.numel();
if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) {
#ifdef CPU_CAPABILITY_AVX2
@ -308,25 +309,25 @@ struct ExponentialKernel {
// ================================================== Bernoulli =======================================================
template<typename RNG>
void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) {
void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(generator->mutex_);
using self_t = scalar_t;
auto p_cpu = p_.to(kCPU);
c10::MaybeOwned<Tensor> p = expand_inplace(self, p_cpu);
auto p = expand_inplace(self, p_cpu);
auto iter = TensorIteratorConfig()
.add_output(self)
.add_input(*p)
.check_all_same_dtype(false)
.build();
if (p_.scalar_type() == kDouble) {
if (p->scalar_type() == kDouble) {
cpu_serial_kernel(iter, [&](const double p_val) -> self_t {
at::bernoulli_distribution<double> bernoulli(p_val);
return static_cast<self_t>(bernoulli(generator));
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
using p_t = scalar_t;
cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t {
at::bernoulli_distribution<float> bernoulli(p_val);
@ -338,7 +339,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) {
}
template<typename RNG>
void bernoulli_kernel(Tensor& self, double p, RNG generator) {
void bernoulli_kernel(const TensorBase &self, double p, RNG generator) {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(generator->mutex_);
@ -352,10 +353,10 @@ void bernoulli_kernel(Tensor& self, double p, RNG generator) {
template<typename RNG>
struct BernoulliKernel {
void operator()(Tensor& self, double p, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, double p, c10::optional<Generator> gen) {
bernoulli_kernel(self, p, check_generator<RNG>(gen));
}
void operator()(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
bernoulli_kernel(self, p_, check_generator<RNG>(gen));
}
};

View File

@ -1,11 +1,12 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/TensorUtils.h>
#include <ATen/NativeFunctions.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/GridSampler.h>
#include <ATen/native/cpu/GridSamplerKernel.h>
#include <ATen/cpu/vml.h>
#include <ATen/core/TensorBase.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/TensorGeometry.h>
#include <ATen/TensorIterator.h>
#include <ATen/cpu/vec/vec.h>
#include <c10/util/C++17.h>
#include <c10/util/irange.h>
@ -1146,13 +1147,12 @@ static inline void grid_sample_2d_grid_slice_iterator(
// and backward.
// See NOTE [ Grid Sample CPU Kernels ] for details.
Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode,
int64_t padding_mode, bool align_corners) {
void grid_sampler_2d_cpu_kernel_impl(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
auto N = input.size(0);
auto H = grid.size(1);
auto W = grid.size(2);
auto output = at::empty({N, input.size(1), H, W}, input.options());
auto spatial_size = H * W;
auto grain_size = spatial_size == 0 ? (N + 1)
: at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/);
@ -1207,14 +1207,14 @@ Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid,
});
#undef HANDLE_CASE
#undef HANDLE_INTERP
return output;
}
std::tuple<Tensor, Tensor>
grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_,
const Tensor& input,
const Tensor& grid,
void grid_sampler_2d_backward_cpu_kernel_impl(
const TensorBase &grad_input,
const TensorBase &grad_grid,
const TensorBase &grad_output_,
const TensorBase &input,
const TensorBase &grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners,
@ -1228,11 +1228,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_,
// is always computed.)
auto input_requires_grad = output_mask[0];
Tensor grad_input;
if (input_requires_grad) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto N = input.size(0);
auto spatial_size = grid.size(1) * grid.size(2);
auto grain_size = spatial_size == 0 ? (N + 1)
@ -1315,8 +1310,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_,
});
#undef HANDLE_CASE
#undef HANDLE_INTERP
return std::make_tuple(grad_input, grad_grid);
}
}

View File

@ -1,17 +1,33 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/cpu/vml.h>
#include <tuple>
#include <array>
#include <cstdint>
namespace at {
class TensorBase;
}
namespace at { namespace native {
using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t, bool);
using backward_2d_fn = std::tuple<Tensor, Tensor>(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool, std::array<bool,2>);
using forward_2d_fn = void (*) (
const TensorBase &output,
const TensorBase &input,
const TensorBase &grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners);
using backward_2d_fn = void (*) (
const TensorBase &grad_input,
const TensorBase &grad_grid,
const TensorBase &grad_output,
const TensorBase &input,
const TensorBase &grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners,
std::array<bool, 2> output_mask);
DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel);
DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel);

View File

@ -1,33 +1,23 @@
#include <ATen/ATen.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/Sorting.h>
#include <ATen/core/TensorBase.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/StridedRandomAccessor.h>
#include <ATen/native/CompositeRandomAccessor.h>
#include <ATen/native/Sorting.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TopKImpl.h>
#include <c10/util/irange.h>
namespace at { namespace native {
namespace {
void _fill_indices(Tensor& indices, int64_t dim) {
auto dim_size = indices.size(dim);
auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong));
auto idx_dim_sizes = std::vector<int64_t>(indices.dim(), 1);
auto idx_dim_strides = std::vector<int64_t>(indices.dim(), 0);
idx_dim_sizes[dim] = dim_size;
idx_dim_strides[dim] = 1;
auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
indices.copy_(idx_dim_restrided);
}
template <typename func_t>
void _dim_apply(
Tensor& values,
Tensor& indices,
const TensorBase &values,
const TensorBase &indices,
int64_t dim,
const std::string& method_name,
const func_t& f) {
@ -95,8 +85,8 @@ struct KeyValueCompDesc {
};
static void sort_kernel(
Tensor& values,
Tensor& indices,
const TensorBase &values,
const TensorBase &indices,
int64_t dim,
bool descending,
bool stable) {
@ -143,9 +133,9 @@ static void sort_kernel(
}
static void topk_kernel(
const Tensor& values,
const Tensor& indices,
const Tensor& self,
const TensorBase &values,
const TensorBase &indices,
const TensorBase &self,
int64_t k,
int64_t dim,
bool largest,

View File

@ -1,4 +1,4 @@
#pragma once
#include <ATen/native/Activation.h>
#include <cstdint>

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/NativeFunctions.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
@ -24,12 +23,12 @@
namespace at { namespace native {
void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional<Generator> gen_) {
void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen_) {
auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
at::native::templates::cuda::bernoulli_kernel(self, p_, generator);
}
void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional<Generator> gen) {
void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
auto iter = TensorIterator::borrowing_nullary_op(self);
auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
at::native::templates::cuda::bernoulli_kernel(iter, p, generator);

View File

@ -1,30 +1,11 @@
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/NativeFunctions.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/native/cuda/DistributionTemplates.h>
#include <curand.h>
#include <curand_kernel.h>
#include <curand_philox4x32_x.h>
#include <utility>
#include <functional>
#include <ATen/native/Distributions.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/TensorIterator.h>
#include <cstdint>
#include <limits>
#include <utility>
#include <type_traits>
namespace at { namespace native {
void normal_kernel(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
void normal_kernel(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
at::native::templates::cuda::normal_kernel(self, mean, std, generator);
}

View File

@ -2,7 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/ExpandBase.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/util/Half.h>
@ -430,7 +430,7 @@ void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transfo
// ==================================================== Normal ========================================================
template<typename RNG>
void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) {
void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) {
auto iter = TensorIterator::borrowing_nullary_op(self);
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
@ -446,7 +446,7 @@ void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) {
template<typename RNG>
struct NormalKernel {
void operator()(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
normal_kernel(self, mean, std, check_generator<RNG>(gen));
}
};
@ -574,7 +574,7 @@ struct CauchyKernel {
template<typename scalar_t, typename prob_t>
void bernoulli_tensor_cuda_kernel(
at::Tensor& ret, const at::Tensor& p,
const TensorBase &ret, const at::TensorBase &p,
PhiloxCudaState philox_args) {
auto functor = [philox_args] __device__(
int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4,
@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel(
}
template<typename RNG>
void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) {
void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) {
PhiloxCudaState rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
@ -626,14 +626,10 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) {
rng_engine_inputs = gen->philox_cuda_state(10);
}
TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type());
auto p_CUDA = p_.to(kCUDA);
//cast probabilities tensor to double for double `self` tensor, and to `float` for everything else
if (self.dtype() == at::kDouble) {
p_CUDA = p_CUDA.to(at::kDouble);
} else {
p_CUDA = p_CUDA.to(at::kFloat);
}
c10::MaybeOwned<Tensor> p = expand_inplace(self, p_CUDA);
// cast probabilities tensor to double for double `self` tensor, and to `float` for everything else
const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat;
auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type));
auto p = expand_inplace(self, p_cuda);
AT_DISPATCH_ALL_TYPES_AND3(
at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] {
if (std::is_same<scalar_t, double>::value) {
@ -662,7 +658,7 @@ struct BernoulliKernel {
void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
bernoulli_kernel(iter, p, check_generator<RNG>(gen));
}
void operator()(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
bernoulli_kernel(self, p_, check_generator<RNG>(gen));
}
};
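Annotation, not part of the commit: the CUDA bernoulli_kernel change above folds the old two-step conversion (move p_ to CUDA, then cast to double or float) into a single .to() that takes its device from self and its dtype from whether self is double. A minimal sketch of that one-hop conversion; the helper name is illustrative:

// Sketch: match the probability tensor to self's device and accumulation
// dtype with a single conversion.
#include <ATen/ATen.h>

at::Tensor prepare_probabilities(const at::Tensor& self, const at::Tensor& p) {
  const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat;
  return p.to(at::TensorOptions().device(self.device()).dtype(p_type));
}

Compared with the removed version, the combined call performs at most one copy.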

View File

@ -1,5 +1,5 @@
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/AccumulateType.h>

View File

@ -0,0 +1,72 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/cuda/GridSampler.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/grid_sampler_2d_backward_native.h>
#include <ATen/ops/grid_sampler_2d_native.h>
#include <ATen/ops/grid_sampler_3d_backward_native.h>
#include <ATen/ops/grid_sampler_3d_native.h>
#include <ATen/ops/zeros_like.h>
#endif
namespace at {
namespace native {
Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
auto in_size = input.sizes();
auto grid_size = grid.sizes();
auto output = at::empty(
{in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options());
launch_grid_sampler_2d_forward_kernel(
output, input, grid, interpolation_mode, padding_mode, align_corners);
return output;
}
Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
auto in_size = input.sizes();
auto grid_size = grid.sizes();
auto output = at::empty(
{in_size[0], in_size[1], grid_size[1], grid_size[2], grid_size[3]},
input.options());
launch_grid_sampler_3d_forward_kernel(
output, input, grid, interpolation_mode, padding_mode, align_corners);
return output;
}
std::tuple<Tensor, Tensor>
grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners, std::array<bool, 2> output_mask) {
Tensor grad_input;
if (output_mask[0]) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
launch_grid_sampler_2d_backward_kernel(
grad_input, grad_grid, grad_output, input,
grid, interpolation_mode, padding_mode, align_corners, output_mask);
return std::make_tuple(grad_input, grad_grid);
}
std::tuple<Tensor, Tensor>
grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
launch_grid_sampler_3d_backward_kernel(
grad_input, grad_grid, grad_output, input,
grid, interpolation_mode, padding_mode, align_corners);
return std::make_tuple(grad_input, grad_grid);
}
}} // namespace at::native

View File

@ -1,10 +1,13 @@
#include <ATen/ATen.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/cuda/GridSampler.h>
#include <ATen/native/cuda/GridSampler.cuh>
#include <ATen/native/cuda/UpSample.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/core/TensorBase.h>
#include <ATen/Dispatch.h>
#include <c10/macros/Macros.h>
namespace at { namespace native {
@ -723,14 +726,12 @@ namespace {
} // namespace
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
void launch_grid_sampler_2d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
auto N = input.size(0);
auto C = input.size(1);
auto H = grid.size(1);
auto W = grid.size(2);
auto output = at::empty({N, C, H, W}, input.options());
int64_t count = N * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_cuda", [&] {
@ -760,18 +761,16 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid,
}
});
}
return output;
}
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid,
int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
void launch_grid_sampler_3d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
auto N = input.size(0);
auto D = grid.size(1);
auto H = grid.size(2);
auto W = grid.size(3);
auto output = at::empty({N, input.size(1), D, H, W}, input.options());
int64_t count = N * D * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_cuda", [&] {
@ -801,15 +800,14 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid,
}
});
}
return output;
}
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
std::tuple<Tensor, Tensor>
grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode,
int64_t padding_mode, bool align_corners,
std::array<bool,2> output_mask) {
void launch_grid_sampler_2d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase &grad_output, const TensorBase &input,
const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners, std::array<bool,2> output_mask) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("grid_sampler_2d_backward_cuda");
@ -822,11 +820,6 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
// is always computed.)
auto input_requires_grad = output_mask[0];
Tensor grad_input;
if (input_requires_grad) {
grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
int64_t count = N * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] {
@ -864,13 +857,13 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
}
});
}
return std::make_tuple(grad_input, grad_grid);
}
// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
std::tuple<Tensor, Tensor>
grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
void launch_grid_sampler_3d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase& grad_output, const TensorBase& input,
const TensorBase& grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
@ -879,8 +872,6 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
auto D = grid.size(1);
auto H = grid.size(2);
auto W = grid.size(3);
auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
int64_t count = N * D * H * W;
if (count > 0) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] {
@ -916,7 +907,6 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
}
});
}
return std::make_tuple(grad_input, grad_grid);
}
}} // namespace at::native

View File

@ -1,5 +1,3 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/cuda/KernelUtils.cuh>
namespace at { namespace native {

View File

@ -0,0 +1,32 @@
#pragma once
#include <array>
#include <cstdint>
namespace at {
class TensorBase;
}
namespace at {
namespace native {
void launch_grid_sampler_2d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
void launch_grid_sampler_3d_forward_kernel(
const TensorBase &output, const TensorBase &input, const TensorBase &grid,
int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
void launch_grid_sampler_2d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase &grad_output, const TensorBase &input,
const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners, std::array<bool, 2> output_mask);
void launch_grid_sampler_3d_backward_kernel(
const TensorBase &grad_input, const TensorBase &grad_grid,
const TensorBase &grad_output, const TensorBase &input,
const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
bool align_corners);
}} // namespace at::native
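Annotation, not part of the new header: declaring the launch_* entry points against TensorBase means this header only needs a forward declaration of the class, and the .cu translation unit that implements them can build under TORCH_ASSERT_NO_OPERATORS, while the new GridSampler.cpp keeps the allocating, operator-facing wrappers. A minimal sketch of such a kernel-facing header with illustrative names:

// my_launcher.h (illustrative): declarations only, no ATen operator headers.
#pragma once
#include <cstdint>

namespace at {
class TensorBase;  // forward declaration is enough for reference parameters
}

namespace example {
// The launcher fills caller-provided tensors and returns nothing, so it never
// allocates and never needs at::empty or at::zeros_like.
void launch_my_forward_kernel(
    const at::TensorBase& output,
    const at::TensorBase& input,
    int64_t mode,
    bool align_corners);
} // namespace example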

View File

@ -905,15 +905,16 @@ Tensor& index_select_out_cuda(
}
Tensor index_select_cuda(const Tensor& self, int64_t dim, const Tensor& index) {
Tensor out;
if (self.is_quantized()){
Tensor out = at::empty({0}, self.options());
at::native::index_select_out_cuda(self, dim, index, out);
return out;
}
Tensor index_select_quantized_cuda(const Tensor& self, int64_t dim, const Tensor& index) {
TORCH_CHECK(
self.qscheme() == kPerTensorAffine,
"Only per_tensor quantized quantized tensors are supported by index_select.")
out = at::empty_quantized({0}, self);
} else {
out = at::empty({0}, self.options());
}
Tensor out = at::empty_quantized({0}, self);
at::native::index_select_out_cuda(self, dim, index, out);
return out;
}

View File

@ -1,15 +1,19 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/ceil_div.h>
#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/Dispatch.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Resize.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/diag.h>
#include <ATen/ops/trace_native.h>
#include <ATen/ops/tril_native.h>
#include <ATen/ops/triu_native.h>
#endif
#include <ATen/cuda/CUDAApplyUtils.cuh>

View File

@ -1,7 +1,10 @@
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/TensorAccessor.h>
#include <ATen/cuda/Atomic.cuh>
#include <c10/util/ArrayRef.h>
#include <c10/util/Optional.h>
#include <c10/util/SmallVector.h>
#include <math.h>
namespace at {

View File

@ -23,10 +23,7 @@ namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto b_ = b.transpose(1, 0);
auto c_ = at::native::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), b.size(0)});
return at::native::matmul(a, b.t());
}
template <typename scalar_t, typename accscalar_t>
@ -209,6 +206,14 @@ Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
Tensor multi_head_self_attention_cuda(
@ -222,29 +227,63 @@ Tensor multi_head_self_attention_cuda(
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
auto q = std::get<0>(q_k_v);
auto k = std::get<1>(q_k_v);
auto v = std::get<2>(q_k_v);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}

View File

@ -6061,7 +6061,7 @@
- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
variants: function, method
- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
- func: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
variants: function, method
dispatch:
CPU: scatter_reduce_two_cpu
@ -6867,7 +6867,8 @@
dispatch:
CPU: index_select_cpu_
QuantizedCPU: index_select_quantized_cpu_
CUDA, QuantizedCUDA: index_select_cuda
CUDA: index_select_cuda
QuantizedCUDA: index_select_quantized_cuda
SparseCPU: index_select_sparse
SparseCUDA: index_select_sparse

View File

@ -160,10 +160,9 @@ Tensor MakeStridedQTensorCPU(
allocator->allocate(size_bytes),
allocator,
/* resizable = */ true);
constexpr auto quantized_cpu_ks = at::DispatchKeySet(at::DispatchKey::QuantizedCPU);
auto tensor = detail::make_tensor<QTensorImpl>(
storage,
quantized_cpu_ks,
at::DispatchKeySet(at::DispatchKey::QuantizedCPU),
dtype,
quantizer);
get_qtensorimpl(tensor)->set_sizes_and_strides(sizes, strides);

View File

@ -2,7 +2,7 @@
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/Activation.h>
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TopKImpl.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/UpSample.h>
#include <ATen/native/cpu/Loops.h>

View File

@ -1,4 +1,5 @@
#include <ATen/ATen.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/TensorIterator.h>

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -44,8 +45,8 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) {
bias = castOp(DataType::Float, bias);
}
auto momentum_ptr = new Double(kMomentum);
auto eps_ptr = new Double(kEps);
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm(
input,
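Annotation, not part of the commit: this and the following nvfuser benchmark diffs replace raw new Double(...) / new Int(...) with IrBuilder::create<T>(...), routing node construction through a factory that can register the node with the active fusion container instead of handing raw ownership to the caller. A generic sketch of such a create<T>() factory; the real IrBuilder in torch/csrc/jit/codegen/cuda does more bookkeeping than shown here:

// Sketch: a minimal owning factory in the spirit of IrBuilder::create<T>().
#include <memory>
#include <utility>
#include <vector>

struct Statement {
  virtual ~Statement() = default;
};

struct Double : Statement {
  explicit Double(double v) : value(v) {}
  double value;
};

class Container {
 public:
  // Perfect-forward the constructor arguments and keep ownership here, so the
  // caller gets a raw pointer it never has to delete.
  template <typename T, typename... Args>
  T* create(Args&&... args) {
    owned_.push_back(std::make_unique<T>(std::forward<Args>(args)...));
    return static_cast<T*>(owned_.back().get());
  }

 private:
  std::vector<std::unique_ptr<Statement>> owned_;
};

// Usage: auto* eps_ptr = container.create<Double>(1e-5);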

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -49,7 +50,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
grad_output = castOp(DataType::Float, grad_output);
}
auto eps_ptr = new Double(kEps);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm_backward(
input,

View File

@ -2,6 +2,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -36,7 +37,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) {
fusion->addInput(tv1);
// TODO: should be input
auto d16 = new Double(1.0);
auto d16 = IrBuilder::create<Double>(1.0);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
@ -47,7 +48,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) {
auto tv3 = add(tv2, tv0);
auto tv10 = softmax(tv3, 3);
auto dropout_tvs = dropout(tv10, new Double(0.9));
auto dropout_tvs = dropout(tv10, IrBuilder::create<Double>(0.9));
auto tv12 = dropout_tvs.mask;
auto tv14 = dropout_tvs.output;
@ -83,9 +84,9 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {
}
// TODO: should be inputs
auto d32 = new Double(1.0);
auto d32 = IrBuilder::create<Double>(1.0);
// fusion->addInput(d32);
auto d33 = new Double(2.0);
auto d33 = IrBuilder::create<Double>(2.0);
// fusion->addInput(d33);
auto tv4 = mul(tv2, tv3);
@ -252,14 +253,15 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {
auto tv5 = broadcast(tv4, {true, true, false});
auto tv6 = add(tv3, tv5);
auto dropout_outs = dropout(tv6, new Double(0.9));
auto dropout_outs = dropout(tv6, IrBuilder::create<Double>(0.9));
auto tv8 = dropout_outs.output;
auto tv10 = dropout_outs.mask;
auto tv11 = add(tv10, tv2);
auto layer_norm_outs = layer_norm(tv11, 1, tv0, tv1, new Double(1e-5));
auto layer_norm_outs =
layer_norm(tv11, 1, tv0, tv1, IrBuilder::create<Double>(1e-5));
auto tv14 = layer_norm_outs.output;
auto tv21 = layer_norm_outs.mean;
auto tv26 = layer_norm_outs.invstd;
@ -481,7 +483,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {
tv1 = castOp(DataType::Float, tv1);
tv8 = castOp(DataType::Float, tv8);
}
auto d36 = mul(new Double(1.0), tv1->axis(2)->extent());
auto d36 = mul(IrBuilder::create<Double>(1.0), tv1->axis(2)->extent());
auto d47 = unaryOp(UnaryOpType::Reciprocal, d36);
auto tv9 = broadcast(tv5, {true, true, false});
@ -583,7 +585,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {
}
// Uncertain this is the right value, but going for it anyways
auto d34 = div(new Double(1.0), tv0->axis(2)->extent());
auto d34 = div(IrBuilder::create<Double>(1.0), tv0->axis(2)->extent());
auto tv25 = mul(tv21, tv0);
auto tv26 = mul(tv25, d34);

View File

@ -4,6 +4,7 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
@ -41,23 +42,23 @@ static void setupFusion(Fusion* fusion) {
auto t5 = castOp(DataType::Float, t4);
auto t6 = broadcast(t3, {true, true, false});
auto t7 = add(t6, t5);
auto t8 = mul(t7, new Double(k_079));
auto t9 = mul(t7, new Double(k_004));
auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
auto t10 = mul(t9, t7);
auto t11 = add(t10, new Int(1));
auto t11 = add(t10, IrBuilder::create<Int>(1));
auto t12 = mul(t8, t11);
auto t13 = unaryOp(UnaryOpType::Tanh, t12);
auto t14 = mul(t7, new Double(0.5));
auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
auto t15 = mul(t13, t13);
auto t16 = unaryOp(UnaryOpType::Neg, t15);
auto t17 = add(t16, new Int(1));
auto t18 = mul(t7, new Double(k_010));
auto t17 = add(t16, IrBuilder::create<Int>(1));
auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
auto t19 = mul(t18, t7);
auto t20 = add(t19, new Double(k_079));
auto t20 = add(t19, IrBuilder::create<Double>(k_079));
auto t21 = mul(t17, t20);
auto t22 = mul(t14, t21);
auto t23 = add(t13, new Int(1));
auto t24 = mul(t23, new Double(0.5));
auto t23 = add(t13, IrBuilder::create<Int>(1));
auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
auto t25 = add(t22, t24);
auto t26 = mul(t25, t1);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -129,7 +130,7 @@ static auto getLayerForwardNormRuntime(
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -129,7 +130,7 @@ static auto getLayerForwardNormRuntime(
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
@ -39,8 +40,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) {
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
auto momentum_ptr = new Double(kMomentum);
auto eps_ptr = new Double(kEps);
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto norm = instance_norm(
input,

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -24,7 +25,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) {
const int kReductionAxis = 1;
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
// setup fusion
auto input = makeContigTensor(2, dtype);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -22,7 +23,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const int kReductionAxis = 1;
Double* eps_ptr = new Double(1e-5);
Double* eps_ptr = IrBuilder::create<Double>(1e-5);
// setup fusion
auto grad_out = makeContigTensor(2, dtype);

View File

@ -1,6 +1,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -151,7 +152,7 @@ static auto getLayerForwardNormRuntime(
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = new Double(kEps);
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);

View File

@ -2,6 +2,7 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
@ -35,7 +36,7 @@ static void setupSoftmaxDropout(
auto attention_scores = makeContigTensor(4, dtype);
auto attention_mask = makeContigTensor(4, dtype);
Double* divisor = new Double();
Double* divisor = IrBuilder::create<Double>();
fusion->addInput(attention_scores);
fusion->addInput(attention_mask);
@ -49,8 +50,8 @@ static void setupSoftmaxDropout(
attention_scores = div(attention_scores, divisor);
attention_scores = add(attention_scores, attention_mask);
auto attention_probs = softmax(attention_scores, kReductionAxis);
auto prob = new Double(kDropoutProbability);
auto scale = new Double(kScale);
auto prob = IrBuilder::create<Double>(kDropoutProbability);
auto scale = IrBuilder::create<Double>(kScale);
auto dropout_results = dropout(attention_probs, prob, scale);
auto output = dropout_results.output;

View File

@ -16,8 +16,8 @@ std::string toString(ReductionParams rparams) {
if (rparams.schedule_3D) {
ss << "3D Schedule // "
<< "Outer Reduction: "
<< (rparams.cross_block_outer_reduce ? "cross block / " : "")
<< (rparams.cross_grid_outer_reduce ? "cross grid / " : "")
<< (rparams.cross_block_outer_reduction ? "cross block / " : "")
<< (rparams.cross_grid_outer_reduction ? "cross grid / " : "")
<< (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : "");
if (rparams.batches_per_block_outer_reduction > 1 ||
rparams.persistent_kernel) {
@ -38,9 +38,9 @@ std::string toString(ReductionParams rparams) {
}
ss << " // Inner Reduction Domain: "
<< (rparams.cross_block_inner_reduce ? "cross block reduction / " : "")
<< (rparams.cross_block_inner_reduction ? "cross block reduction / " : "")
<< (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "")
<< (rparams.cross_grid_inner_reduce ? "cross grid reduction / " : "");
<< (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : "");
if (rparams.batches_per_block_inner_reduction > 1 ||
rparams.persistent_kernel) {
@ -48,7 +48,7 @@ std::string toString(ReductionParams rparams) {
<< " / ";
}
ss << (rparams.cross_grid_inner_reduce &&
ss << (rparams.cross_grid_inner_reduction &&
rparams.split_grid_dim_inner_reduction
? "split grid dimension / "
: "")

View File

@ -1,47 +1,14 @@
#include <c10/core/DispatchKey.h>
#include <c10/core/DispatchKeySet.h>
#include <unordered_map>
namespace c10 {
const char* toString(BackendComponent t) {
switch (t) {
case BackendComponent::CPUBit:
return "CPUBit";
case BackendComponent::CUDABit:
return "CUDABit";
case BackendComponent::HIPBit:
return "HIPBit";
case BackendComponent::XLABit:
return "XLABit";
case BackendComponent::LazyBit:
return "LazyBit";
case BackendComponent::XPUBit:
return "XPUBit";
case BackendComponent::MLCBit:
return "MLCBit";
case BackendComponent::HPUBit:
return "HPUBit";
case BackendComponent::VEBit:
return "VEBit";
case BackendComponent::PrivateUse1Bit:
return "PrivateUse1Bit";
case BackendComponent::PrivateUse2Bit:
return "PrivateUse2Bit";
case BackendComponent::PrivateUse3Bit:
return "PrivateUse3Bit";
case BackendComponent::InvalidBit:
return "InvalidBit";
default:
return "UNKNOWN_BACKEND_BIT";
}
}
const char* toString(DispatchKey t) {
switch (t) {
case DispatchKey::Undefined:
return "Undefined";
case DispatchKey::CPU:
return "CPU";
case DispatchKey::CUDA:
@ -100,6 +67,8 @@ const char* toString(DispatchKey t) {
case DispatchKey::Python:
return "Python";
case DispatchKey::PythonTLSSnapshot:
return "PythonTLSSnapshot";
case DispatchKey::PrivateUse1:
return "PrivateUse1";
@ -134,6 +103,8 @@ const char* toString(DispatchKey t) {
return "AutogradMLC";
case DispatchKey::AutogradHPU:
return "AutogradHPU";
case DispatchKey::AutogradNestedTensor:
return "AutogradNestedTensor";
case DispatchKey::AutogradPrivateUse1:
return "AutogradPrivateUse1";
case DispatchKey::AutogradPrivateUse2:
@ -142,8 +113,6 @@ const char* toString(DispatchKey t) {
return "AutogradPrivateUse3";
case DispatchKey::AutogradOther:
return "AutogradOther";
case DispatchKey::AutogradNestedTensor:
return "AutogradNestedTensor";
case DispatchKey::ZeroTensor:
return "ZeroTensor";
@ -201,15 +170,6 @@ const char* toString(DispatchKey t) {
case DispatchKey::FuncTorchBatched:
return "FuncTorchBatched";
case DispatchKey::Dense:
return "Dense";
case DispatchKey::Quantized:
return "Quantized";
case DispatchKey::Sparse:
return "Sparse";
case DispatchKey::AutogradFunctionality:
return "AutogradFunctionality";
default:
return "UNKNOWN_TENSOR_TYPE_ID";
}
@ -218,39 +178,79 @@ const char* toString(DispatchKey t) {
std::ostream& operator<<(std::ostream& str, DispatchKey rhs) {
return str << toString(rhs);
}
std::ostream& operator<<(std::ostream& str, BackendComponent rhs) {
return str << toString(rhs);
}
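A short usage sketch (not part of the diff) for the printing helpers added above; the expected strings follow the toString switches.

#include <c10/core/DispatchKey.h>
#include <iostream>

int main() {
  // operator<< for BackendComponent routes through the new toString overload.
  std::cout << c10::BackendComponent::CUDABit << "\n";               // "CUDABit"
  std::cout << c10::toString(c10::DispatchKey::AutogradCPU) << "\n"; // "AutogradCPU"
  return 0;
}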
DispatchKey getAutogradKeyFromBackend(BackendComponent k) {
// We want this to return an autograd key. We're relying on the fact that
// getAutogradRelatedKeySetFromBackend returns an autograd key +
// ADInplaceOrView, and autograd has higher precedence. The core mapping from
// backend -> autograd key lives in `getAutogradRelatedKeySetFromBackend`
// instead of here for performance. `getAutogradRelatedKeySetFromBackend` is a
// hotpath function, and we want to make sure that it doesn't have to
// construct any DispatchKeySets at runtime.
return getAutogradRelatedKeySetFromBackend(k).highestPriorityTypeId();
// for a given backend key, return the associated autograd key.
// for non-backend keys, return AutogradOther as a default.
// Note: it's convenient and fast to return a default here rather than (say)
// returning an optional<DispatchKey>, or throwing. But it makes callers
// responsible for either a) enforcing the invariant that only backend keys
// be passed as arguments, or b) interpreting our return value carefully.
//
DispatchKey getAutogradKeyFromBackend(DispatchKey t) {
switch (t) {
case DispatchKey::CPU:
return DispatchKey::AutogradCPU;
case DispatchKey::XPU:
return DispatchKey::AutogradXPU;
case DispatchKey::CUDA:
return DispatchKey::AutogradCUDA;
case DispatchKey::XLA:
return DispatchKey::AutogradXLA;
case DispatchKey::Lazy:
return DispatchKey::AutogradLazy;
case DispatchKey::MLC:
return DispatchKey::AutogradMLC;
case DispatchKey::HPU:
return DispatchKey::AutogradHPU;
case DispatchKey::NestedTensor:
return DispatchKey::AutogradNestedTensor;
case DispatchKey::PrivateUse1:
return DispatchKey::AutogradPrivateUse1;
case DispatchKey::PrivateUse2:
return DispatchKey::AutogradPrivateUse2;
case DispatchKey::PrivateUse3:
return DispatchKey::AutogradPrivateUse3;
default:
return DispatchKey::AutogradOther;
}
}
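An illustrative check (not part of the diff) of the behavior the comment above describes for the new BackendComponent-based overload; the function name is hypothetical.

#include <c10/core/DispatchKeySet.h>
#include <cassert>

void autograd_key_example() {
  using namespace c10;
  // A backend bit maps to its per-backend autograd runtime key.
  assert(getAutogradKeyFromBackend(BackendComponent::CPUBit) == DispatchKey::AutogradCPU);
  assert(getAutogradKeyFromBackend(BackendComponent::CUDABit) == DispatchKey::AutogradCUDA);
  // Per the comment above, the related keyset also carries ADInplaceOrView,
  // and the autograd key wins because it has higher priority.
  auto ks = getAutogradRelatedKeySetFromBackend(BackendComponent::CPUBit);
  assert(ks.has(DispatchKey::ADInplaceOrView));
  assert(ks.highestPriorityTypeId() == DispatchKey::AutogradCPU);
}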
c10::DispatchKey parseDispatchKey(const std::string& k) {
static std::unordered_map<std::string, c10::DispatchKey> key_map = {
{"Undefined", c10::DispatchKey::Undefined},
{"Dense", c10::DispatchKey::Dense},
{"CPU", c10::DispatchKey::CPU},
{"CUDA", c10::DispatchKey::CUDA},
{"HIP", c10::DispatchKey::HIP},
{"FPGA", c10::DispatchKey::FPGA},
{"ORT", c10::DispatchKey::ORT},
{"XLA", c10::DispatchKey::XLA},
{"MLC", c10::DispatchKey::MLC},
{"Vulkan", c10::DispatchKey::Vulkan},
{"Metal", c10::DispatchKey::Metal},
{"XPU", c10::DispatchKey::XPU},
{"HPU", c10::DispatchKey::HPU},
{"VE", c10::DispatchKey::VE},
{"Lazy", c10::DispatchKey::Lazy},
{"Meta", c10::DispatchKey::Meta},
{"Quantized", c10::DispatchKey::Quantized},
{"QuantizedCPU", c10::DispatchKey::QuantizedCPU},
{"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA},
{"QuantizedXPU", c10::DispatchKey::QuantizedXPU},
{"CustomRNGKeyId", c10::DispatchKey::CustomRNGKeyId},
{"MkldnnCPU", c10::DispatchKey::MkldnnCPU},
{"Sparse", c10::DispatchKey::Sparse},
{"SparseCPU", c10::DispatchKey::SparseCPU},
{"SparseCUDA", c10::DispatchKey::SparseCUDA},
{"SparseHIP", c10::DispatchKey::SparseHIP},
{"SparseXPU", c10::DispatchKey::SparseXPU},
{"SparseVE", c10::DispatchKey::SparseVE},
{"SparseCsrCPU", c10::DispatchKey::SparseCsrCPU},
{"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA},
{"NestedTensor", c10::DispatchKey::NestedTensor},
{"PrivateUse1", c10::DispatchKey::PrivateUse1},
{"PrivateUse2", c10::DispatchKey::PrivateUse2},
{"PrivateUse3", c10::DispatchKey::PrivateUse3},
{"BackendSelect", c10::DispatchKey::BackendSelect},
{"Python", c10::DispatchKey::Python},
{"PythonTLSSnapshot", c10::DispatchKey::PythonTLSSnapshot},
{"Named", c10::DispatchKey::Named},
{"Conjugate", c10::DispatchKey::Conjugate},
{"Negative", c10::DispatchKey::Negative},
@ -259,8 +259,17 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
c10::DispatchKey::FuncTorchDynamicLayerBackMode},
{"ADInplaceOrView", c10::DispatchKey::ADInplaceOrView},
{"AutogradOther", c10::DispatchKey::AutogradOther},
{"AutogradFunctionality", c10::DispatchKey::AutogradFunctionality},
{"AutogradCPU", c10::DispatchKey::AutogradCPU},
{"AutogradCUDA", c10::DispatchKey::AutogradCUDA},
{"AutogradXLA", c10::DispatchKey::AutogradXLA},
{"AutogradLazy", c10::DispatchKey::AutogradLazy},
{"AutogradXPU", c10::DispatchKey::AutogradXPU},
{"AutogradMLC", c10::DispatchKey::AutogradMLC},
{"AutogradHPU", c10::DispatchKey::AutogradHPU},
{"AutogradNestedTensor", c10::DispatchKey::AutogradNestedTensor},
{"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1},
{"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2},
{"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3},
{"Tracer", c10::DispatchKey::Tracer},
{"AutocastCPU", c10::DispatchKey::AutocastCPU},
{"AutocastCUDA", c10::DispatchKey::AutocastCUDA},
@ -274,41 +283,6 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
{"TESTING_ONLY_GenericWrapper",
c10::DispatchKey::TESTING_ONLY_GenericWrapper},
{"TESTING_ONLY_GenericMode", c10::DispatchKey::TESTING_ONLY_GenericMode},
{"CPU", c10::DispatchKey::CPU},
{"CUDA", c10::DispatchKey::CUDA},
{"HIP", c10::DispatchKey::HIP},
{"XLA", c10::DispatchKey::XLA},
{"MLC", c10::DispatchKey::MLC},
{"XPU", c10::DispatchKey::XPU},
{"HPU", c10::DispatchKey::HPU},
{"Lazy", c10::DispatchKey::Lazy},
{"NestedTensor", c10::DispatchKey::NestedTensor},
{"PrivateUse1", c10::DispatchKey::PrivateUse1},
{"PrivateUse2", c10::DispatchKey::PrivateUse2},
{"PrivateUse3", c10::DispatchKey::PrivateUse3},
{"QuantizedCPU", c10::DispatchKey::QuantizedCPU},
{"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA},
{"QuantizedXPU", c10::DispatchKey::QuantizedXPU},
{"SparseCPU", c10::DispatchKey::SparseCPU},
{"SparseCUDA", c10::DispatchKey::SparseCUDA},
{"SparseHIP", c10::DispatchKey::SparseHIP},
{"SparseXPU", c10::DispatchKey::SparseXPU},
{"SparseVE", c10::DispatchKey::SparseVE},
{"AutogradCPU", c10::DispatchKey::AutogradCPU},
{"AutogradCUDA", c10::DispatchKey::AutogradCUDA},
{"AutogradXLA", c10::DispatchKey::AutogradXLA},
{"AutogradLazy", c10::DispatchKey::AutogradLazy},
{"AutogradXPU", c10::DispatchKey::AutogradXPU},
{"AutogradMLC", c10::DispatchKey::AutogradMLC},
{"AutogradHPU", c10::DispatchKey::AutogradHPU},
{"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1},
{"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2},
{"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3},
{"Autograd", c10::DispatchKey::Autograd},
{"CompositeImplicitAutograd",
c10::DispatchKey::CompositeImplicitAutograd},

View File

@ -9,98 +9,20 @@
namespace c10 {
// Semantically, each value of BackendComponent identifies a "backend" for our
// dispatch. Some functionalities that we may dispatch to are allowed to
// register different handlers for each backend. The BackendComponent is then
// used to figure out which backend implementation to dispatch to.
// In implementation terms, the backend component identifies a specific "bit" in
// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom
// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to
// functionalities. When we encounter a functionality bit that is known to be
// customizable per-backend, then we also look at the lower BackendComponent
// bits and take the highest bit to determine which backend's implementation to
// use.
enum class BackendComponent : uint8_t {
// A "backend" is colloquially used to refer to handlers for dispatch
// which actually implement the numerics of an operation in question.
//
// Due to the nature of the enum, these backends are specified in
// an ordered way, but for most backends this order is not semantically
// meaningful (e.g., it's valid to reorder these backends without changing
// semantics). The only situation when backend ordering is meaningful
// is when the backend participates in multiple dispatch with another
// backend; e.g., CPU and CUDA (cuda must have higher priority).
// These keys don't correspond to individual kernels.
// Instead, they represent the backends that are allowed to override specific
// pieces of functionality:
// - dense kernels (e.g. DispatchKey::CPU)
// - sparse kernels (e.g. DispatchKey::SparseCPU)
// - quantized kernels (e.g. DispatchKey::QuantizedCPU)
// - autograd kernels (e.g. DispatchKey::AutogradCPU)
// We reserve space in the runtime operator table for this full cross product
// of
// [backends in this enum] x [keys below that are explicitly marked as having
// per-backend functionality]
InvalidBit = 0,
CPUBit,
CUDABit,
HIPBit,
XLABit,
MLCBit,
XPUBit,
HPUBit,
VEBit,
LazyBit,
PrivateUse1Bit,
PrivateUse2Bit,
PrivateUse3Bit,
// Define an alias to represent end of backend dispatch keys.
// If you add new backend keys after PrivateUse3, please also update it here.
// (But you shouldn't: private use keys should have higher precedence than
// all built-in keys)
EndOfBackendKeys = PrivateUse3Bit,
};
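A minimal sketch (not part of the diff) of the bit assignment described above; it mirrors the DispatchKeySet(BackendComponent) constructor shown further down, so the exact masks are assumptions that follow from that constructor.

#include <c10/core/DispatchKey.h>
#include <cstdint>

// Mirrors DispatchKeySet(BackendComponent): bit (k - 1), with InvalidBit
// mapping to no bits at all.
constexpr uint64_t backend_bit(c10::BackendComponent b) {
  return b == c10::BackendComponent::InvalidBit
      ? 0
      : 1ULL << (static_cast<uint8_t>(b) - 1);
}

static_assert(backend_bit(c10::BackendComponent::CPUBit) == (1ULL << 0),
              "CPU occupies the lowest backend bit");
static_assert(backend_bit(c10::BackendComponent::CUDABit) == (1ULL << 1),
              "CUDA occupies the next bit");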
// Semantically, a dispatch key identifies a possible "level" in our
// dispatch, for which a handler may be registered. Each handler corresponds
// to a type of functionality.
// dispatch, for which a handler may be registered. Traditional
// backends like CPU and CUDA get dispatch keys; however, so do
// "wrapping" layers like Variable (for autograd handling).
//
// In implementation terms, the dispatch key identifies a specific "bit" in a
// DispatchKeySet. Higher bit indexes get handled by dispatching first (because
// we "count leading zeros" when we extract the highest priority dispatch
// key.)
//
// Note [DispatchKey Classification]
// This enum actually contains several types of keys, which are explained
// in more detail further down:
// (1) non-customizable backends (e.g. FPGA)
// (2) non-customizable functionalities (e.g. Functionalize)
// (3) functionalities that are customizable per backend (e.g. Dense, Sparse,
//     AutogradFunctionality)
// (4) per-backend instances of customizable functionalities (e.g. CPU,
//     SparseCPU, AutogradCPU)
// (5) alias keys (e.g. CompositeImplicitAutograd)
//
// Of the categories above, it's important to note:
// (a) which keys are assigned individual bits in a DispatchKeySet
// (b) which keys are assigned individual slots in the runtime operator table
// ("Runtime keys")
//
// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet.
// (1), (2) and (4) all get their own dedicated slots in the runtime operator
// table.
// See Note [DispatchKeySet Internal Representation] for more details.
//
// NOTE: Keep the list in sync with `DispatchKey` in tools/codegen/model.py
enum class DispatchKey : uint16_t {
enum class DispatchKey : uint8_t {
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
// This is not a "real" functionality, but it exists to give us a "nullopt"
// This is not a "real" tensor id, but it exists to give us a "nullopt"
// element we can return for cases when a DispatchKeySet contains no elements.
// You can think a more semantically accurate definition of DispatchKey is:
//
@ -116,31 +38,24 @@ enum class DispatchKey : uint16_t {
// this will get eliminated, but for now it's convenient)
CatchAll = Undefined,
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ //
// Every value in the enum (up to EndOfFunctionalityKeys)
// corresponds to an individual "functionality" that can be dispatched to.
// This is represented in the DispatchKeySet by assigning each of these enum
// values
// to each of the remaining (64 - len(BackendComponent)) bits.
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
// A "backend" is colloquially used to refer to handlers for dispatch
// which actually implement the numerics of an operation in question.
//
// Most of these functionalities have a single handler assigned to them,
// making them "runtime keys".
// That map to a single slot in the runtime operator table.
//
// A few functionalities are allowed to be customizable per backend.
// See [Note: Per-Backend Functionality Dispatch Keys] for details.
// See [Note: Per-Backend Functionality Dispatch Keys]
Dense,
// Below are non-extensible backends.
// These are backends that currently don't have their own overrides for
// Autograd/Sparse/Quantized kernels,
// and we therefore don't waste space in the runtime operator table allocating
// space for them.
// If any of these backends ever need to customize, e.g., Autograd, then we'll
// need to add a DispatchKey::*Bit for them.
// Due to the nature of the enum, these backends are specified in
// an ordered way, but for most backends this order is not semantically
// meaningful (e.g., it's valid to reorder these backends without changing
// semantics). The only situation when backend ordering is meaningful
// is when the backend participates in multiple dispatch with another
// backend; e.g., CPU and SparseCPU (sparse must have
// higher priority).
// Here are backends which you think of as traditionally specifying
// how to implement operations on some device.
CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp
CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp
HIP, // NB: I think this is not actually used, due to Note [Masquerading as
// CUDA]
FPGA, // Xilinx support lives out of tree at
// https://gitlab.com/pytorch-complex/vitis_kernels
@ -152,8 +67,14 @@ enum class DispatchKey : uint16_t {
// - aten/src/ATen/test/extension_backend_test.cpp
ORT,
XLA, // lives out of tree at https://github.com/pytorch/xla
MLC, // lives out of tree at https://github.com/pytorch/MLCompute
Vulkan,
Metal,
XPU, // For out of tree Intel's heterogeneous computing plug-in
HPU, // For out of tree & closed source integration of HPU / Habana
VE, // For out of tree & closed source integration of SX-Aurora / NEC
Lazy, // For lazy tensor backends
// A meta tensor is a tensor without any data associated with it. (They
// have also colloquially been referred to as tensors on the "null" device).
@ -162,8 +83,11 @@ enum class DispatchKey : uint16_t {
// tensor with the output shape and dtype, but wouldn't actually add anything.
Meta,
// See [Note: Per-Backend Functionality Dispatch Keys]
Quantized,
// Here are backends which specify more specialized operators
// based on the dtype of the tensor.
QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp
QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp
QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in
// This backend is to support custom RNGs; it lets you go
// to a different kernel if you pass in a generator that is not a
@ -182,29 +106,31 @@ enum class DispatchKey : uint16_t {
// the corresponding dense tensors, and must be handled before them.
MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp
// NB: not to be confused with MKLDNN, which is Caffe2 only
// See [Note: Per-Backend Functionality Dispatch Keys]
Sparse,
SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp
SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp
SparseHIP, // TODO: I think this is not actually used, due to Note
// [Masquerading as CUDA]
SparseXPU, // For out of tree Intel's heterogeneous computing plug-in
SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC
SparseCsrCPU,
SparseCsrCUDA,
// Note [Non-Customizable Backend Keys]
// Every key above here is considered a "non-customizable backend".
// These are backends that will work correctly with autograd, but
// currently don't require separate implementations
// for autograd, sparse, or quantized kernels.
// Any new backends that don't need to be customized should go above here.
// If an existing backend needs to e.g. override autograd, then we can
// consider promoting it into the "BackendComponent" enum
//
// For all intents and purposes from the perspective of DispatchKeySet,
// "non-customizable backend" keys are treated the same way
// as other functionality keys
EndOfNonCustomizableBackends = SparseCsrCUDA,
NestedTensor, // lives out of tree at https://github.com/pytorch/nestedtensor
// Here are reserved backends for user-defined backends, see Note [Private use
// DispatchKey]
// To see some example about how to use this, check out ORT
PrivateUse1,
PrivateUse2,
PrivateUse3,
// Define an alias key to represent end of backend dispatch keys.
// If you add new backend keys after PrivateUse3, please also update it here.
// (But you shouldn't: private use keys should have higher precedence than
// all built-in keys)
EndOfBackendKeys = PrivateUse3,
// In some situations, it is not immediately obvious what the correct
// backend for a function is, because the function in question doesn't
// have any "tensor" arguments. In this case, a BackendSelect function
@ -307,18 +233,20 @@ enum class DispatchKey : uint16_t {
// AutogradOther key. We can add specific autograd key for those backends
// upon request.
AutogradOther,
// See [Note: Per-Backend Functionality Dispatch Keys]
AutogradFunctionality,
// NestedTensor is an example of something that isn't a "real backend"
// (because it mostly consists of redispatching kernels)
// but it would like to override autograd functionality in C++.
// We can handle cases like this by adding an extra functionality key
// exclusively for handling autograd for NestedTensor.
// lives out of tree at
AutogradCPU,
AutogradCUDA,
AutogradXLA,
AutogradLazy,
AutogradXPU,
AutogradMLC,
AutogradHPU,
AutogradNestedTensor, // lives out of tree at
// https://github.com/pytorch/nestedtensor
AutogradNestedTensor,
// Here are some reserved pre-autograd keys for user-defined backends, see
// Note [Private use DispatchKey]
AutogradPrivateUse1,
AutogradPrivateUse2,
AutogradPrivateUse3,
Tracer,
@ -354,6 +282,11 @@ enum class DispatchKey : uint16_t {
Functionalize,
FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype]
// Used by Python key logic to know the set of tls on entry to the dispatcher
// This kernel assumes it is at the very top of the dispatcher. If you add
// a key above, make sure to update the fallback implementation for this.
PythonTLSSnapshot,
// TESTING: This is intended to be a generic testing tensor type id.
// Don't use it for anything real; its only acceptable use is within a single
// process test. Use it by creating a TensorImpl with this DispatchKey, and
@ -371,100 +304,9 @@ enum class DispatchKey : uint16_t {
TESTING_ONLY_GenericMode,
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
EndOfFunctionalityKeys, // End of functionality keys.
// ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ //
// Here are backends which you think of as traditionally specifying
// how to implement operations on some device.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfDenseBackends,
CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp
CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp
HIP, // NB: I think this is not actually used, due to Note [Masquerading as
// CUDA]
XLA, // lives out of tree at https://github.com/pytorch/xla
MLC, // lives out of tree at https://github.com/pytorch/MLCompute
XPU, // For out of tree Intel's heterogeneous computing plug-in
HPU, // For out of tree & closed source integration of HPU / Habana
VE, // For out of tree & closed source integration of SX-Aurora / NEC
Lazy, // For lazy tensor backends
// Here are reserved backends for user-defined backends, see Note [Private use
// DispatchKey]
// To see some example about how to use this, check out ORT
PrivateUse1,
PrivateUse2,
PrivateUse3,
EndOfDenseBackends = PrivateUse3,
// ~~~~~~~~~~~~~~ "Quantized" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~ //
// keys starting with an _ are not currently used,
// but are needed to ensure that every backend is indexed correctly.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfQuantizedBackends,
QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp
QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp
_QuantizedHIP,
_QuantizedXLA,
_QuantizedMLC,
QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in
_QuantizedHPU,
_QuantizedVE,
_QuantizedLazy,
_QuantizedPrivateUse1,
_QuantizedPrivateUse2,
_QuantizedPrivateUse3,
EndOfQuantizedBackends = _QuantizedPrivateUse3,
// ~~~~~~~~~~~~~~ "Sparse" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ //
// keys starting with an _ are not currently used,
// but are needed to ensure that every backend is indexed correctly.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfSparseBackends,
SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp
SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp
SparseHIP, // TODO: I think this is not actually used, due to Note
// [Masquerading as CUDA]
_SparseXLA,
_SparseMLC,
SparseXPU, // For out of tree Intel's heterogeneous computing plug-in
_SparseHPU,
SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC
_SparseLazy,
_SparsePrivateUse1,
_SparsePrivateUse2,
_SparsePrivateUse3,
EndOfSparseBackends = _SparsePrivateUse3,
// ~~~~~~~~~~~~~~ "Autograd" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~ //
// keys starting with an _ are not currently used,
// but are needed to ensure that every backend is indexed correctly.
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
StartOfAutogradBackends,
AutogradCPU,
AutogradCUDA,
_AutogradHIP,
AutogradXLA,
AutogradMLC,
AutogradXPU,
AutogradHPU,
_AutogradVE,
AutogradLazy,
// Here are some reserved pre-autograd keys for user-defined backends, see
// Note [Private use DispatchKey]
AutogradPrivateUse1,
AutogradPrivateUse2,
AutogradPrivateUse3,
EndOfAutogradBackends = AutogradPrivateUse3,
// If we add a new per-backend functionality key that has higher priority
// than Autograd, then this key should be updated.
EndOfRuntimeBackendKeys = EndOfAutogradBackends,
NumDispatchKeys, // Sentinel, end of runtime keys.
// ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ //
// Note [Alias Dispatch Keys]
// Alias dispatch keys are synthetic dispatch keys which map to multiple
// runtime dispatch keys. Alias keys have precedence, but they are always
// lower precedence than runtime keys. You can register a kernel to an
@ -484,7 +326,6 @@ enum class DispatchKey : uint16_t {
// Define an alias key to represent end of alias dispatch keys.
// If you add new alias keys after Autograd, please also update it here.
StartOfAliasKeys = Autograd,
EndOfAliasKeys = CompositeExplicitAutograd, //
// ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
@ -524,83 +365,54 @@ enum class DispatchKey : uint16_t {
// built-in autograd formulas for operators are not appropriate.
static_assert(
(static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) +
static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys)) <= 64,
"The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)"
" both map to backend and functionality bits"
" into a 64-bit bitmask; you must have less than 64 total entries between them");
// Check if a DispatchKey is an alias mapping to other runtime keys.
constexpr bool isAliasDispatchKey(DispatchKey k) {
return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys;
}
// [Note: Per-Backend Functionality Dispatch Keys]
// Check if a DispatchKey is a per-backend functionality key
// Any functionalities that can be customized per-backend should be added here.
// These keys correspond to functionalities that can be customized individually
// per backend. While they only take up one bit in the `DispatchKeySet` bitset,
// they map to (# backends) slots in the operator table.
// Each of these keys also has a separate set of "runtime keys" in the dispatch
// key enum, per backend, which *do* map to the individual operator table slots.
// For example, the "Sparse" key maps to an individual bit in the
// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual
// slots in the runtime operator table.
constexpr bool isPerBackendFunctionalityKey(DispatchKey k) {
if (k == DispatchKey::Dense || k == DispatchKey::Quantized ||
k == DispatchKey::Sparse || k == DispatchKey::AutogradFunctionality) {
return true;
} else {
return false;
}
}
// Note that this includes Undefined in the total count.
// BUT EndOfFunctionalityKeys is its own (placeholder) key.
// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3.
// In the above example, there are 3 total functionality keys.
constexpr uint8_t num_functionality_keys =
static_cast<uint8_t>(DispatchKey::EndOfFunctionalityKeys);
// Note [No More Than 16 Backends]
// Search for this note to find places in the code where the "no more than 16
// backends" invariant is baked in.
static_assert(
static_cast<uint8_t>(BackendComponent::EndOfBackendKeys) <= 16,
"BackendComponent currently only supports <= 16 backends. If we really need to extend this, \
there are a few places where this invariant is baked in");
constexpr uint8_t numPerBackendFunctionalityKeys() {
uint8_t count = 0;
for (uint8_t k = 0; k <= num_functionality_keys; ++k) {
if (isPerBackendFunctionalityKey(static_cast<DispatchKey>(k)))
++count;
}
return count;
}
static_cast<uint8_t>(DispatchKey::NumDispatchKeys) <= 64,
"DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries");
#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS)
// See [Note: Trimmed Mobile Dispatch Keys]
constexpr uint8_t num_backends = 1; // Only CPU
constexpr uint16_t num_runtime_entries = 8;
/**
* The method below maps the dispatch key in the enum DispatchKey to an
* integer index in the dispatchTable_ array in OperatorEntry. The array
* is trimmed for mobile to reduce peak memory usage since it's
* unnecessary to reserve additional space for dispatch keys that will
* never be used on mobile.
*/
C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) {
switch (dk) {
case DispatchKey::Undefined:
return 0;
case DispatchKey::CPU:
return 1;
case DispatchKey::QuantizedCPU:
return 2;
case DispatchKey::SparseCPU:
return 3;
case DispatchKey::BackendSelect:
return 4;
case DispatchKey::ADInplaceOrView:
return 5;
case DispatchKey::AutogradOther:
return 6;
case DispatchKey::AutogradCPU:
return 7;
case DispatchKey::NumDispatchKeys: // Sentinel, end of runtime keys.
return 8;
default:
return -1;
}
}
#else
constexpr uint8_t num_backends =
static_cast<uint8_t>(BackendComponent::EndOfBackendKeys);
constexpr uint16_t num_runtime_entries = num_functionality_keys +
(numPerBackendFunctionalityKeys() * (num_backends - 1));
/**
* For the server use-case, make this a simple pass-through.
*/
C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) {
return static_cast<int>(dk);
}
#endif
// See Note [No More Than 16 Backends]
constexpr uint16_t full_backend_mask =
(static_cast<uint16_t>(1) << num_backends) - 1;
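Worked out for a non-mobile build (illustrative only, not part of the diff): with the 12 backend bits above and the 4 per-backend functionality keys, the sizing constants relate as follows.

#include <c10/core/DispatchKey.h>

static_assert(c10::num_backends == 12,
              "CPUBit .. PrivateUse3Bit in the BackendComponent enum");
static_assert(c10::numPerBackendFunctionalityKeys() == 4,
              "Dense, Quantized, Sparse, AutogradFunctionality");
static_assert(c10::num_runtime_entries ==
                  c10::num_functionality_keys + 4 * (c10::num_backends - 1),
              "one slot per functionality, plus per-backend copies of the 4 keys above");
static_assert(c10::full_backend_mask == (1u << c10::num_backends) - 1,
              "the low 12 bits of the keyset are backend bits");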
C10_API const char* toString(DispatchKey);
C10_API const char* toString(BackendComponent);
C10_API std::ostream& operator<<(std::ostream&, DispatchKey);
C10_API std::ostream& operator<<(std::ostream&, BackendComponent);
C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k);
C10_API DispatchKey getAutogradKeyFromBackend(DispatchKey t);
// Parses a string into a dispatch key.
// If the string cannot be correctly parsed, throws an exception.
@ -613,86 +425,10 @@ C10_API c10::DispatchKey parseDispatchKey(const std::string& k);
// torch::dispatch(torch::kCPU, ...) is also valid.
constexpr DispatchKey kAutograd = DispatchKey::Autograd;
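A small usage sketch (illustrative only) for parseDispatchKey and the kAutograd alias declared above; the names round-trip through the string table in DispatchKey.cpp, and unknown names throw.

#include <c10/core/DispatchKey.h>
#include <cassert>

void parse_example() {
  assert(c10::parseDispatchKey("AutogradCPU") == c10::DispatchKey::AutogradCPU);
  assert(c10::parseDispatchKey(c10::toString(c10::DispatchKey::CUDA)) ==
         c10::DispatchKey::CUDA);
  static_assert(c10::kAutograd == c10::DispatchKey::Autograd,
                "kAutograd is just an alias for the Autograd alias key");
}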
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
// This function relies on the invariant that the dispatch keys between
// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend
// in the same order as `BackendComponent`.
constexpr BackendComponent toBackendComponent(DispatchKey k) {
if (k >= DispatchKey::StartOfDenseBackends &&
k <= DispatchKey::EndOfDenseBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfDenseBackends));
} else if (
k >= DispatchKey::StartOfQuantizedBackends &&
k <= DispatchKey::EndOfQuantizedBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends));
} else if (
k >= DispatchKey::StartOfSparseBackends &&
k <= DispatchKey::EndOfSparseBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfSparseBackends));
} else if (
k >= DispatchKey::StartOfAutogradBackends &&
k <= DispatchKey::EndOfAutogradBackends) {
return static_cast<BackendComponent>(
static_cast<uint8_t>(k) -
static_cast<uint8_t>(DispatchKey::StartOfAutogradBackends));
} else {
return BackendComponent::InvalidBit;
}
// Check if a DispatchKey is an alias mapping to other runtime keys.
inline bool isAliasDispatchKey(DispatchKey k) {
return k > DispatchKey::NumDispatchKeys && k <= DispatchKey::EndOfAliasKeys;
}
constexpr DispatchKey toFunctionalityKey(DispatchKey k) {
if (k <= DispatchKey::EndOfFunctionalityKeys) {
return k;
} else if (k <= DispatchKey::EndOfDenseBackends) {
return DispatchKey::Dense;
} else if (k <= DispatchKey::EndOfQuantizedBackends) {
return DispatchKey::Quantized;
} else if (k <= DispatchKey::EndOfSparseBackends) {
return DispatchKey::Sparse;
} else if (k <= DispatchKey::EndOfAutogradBackends) {
return DispatchKey::AutogradFunctionality;
} else {
return DispatchKey::Undefined;
}
}
// Given (DispatchKey::Dense, DispatchKey::CUDABit), returns DispatchKey::CUDA
// See Note [The Ordering of Per-Backend Dispatch Keys Matters!]
// This function relies on the invariant that the dispatch keys between
// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend
// in the same order as `BackendComponent`.
constexpr DispatchKey toRuntimePerBackendFunctionalityKey(
DispatchKey functionality_k,
BackendComponent backend_k) {
if (functionality_k == DispatchKey::Dense) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfDenseBackends) +
static_cast<uint8_t>(backend_k));
}
if (functionality_k == DispatchKey::Sparse) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfSparseBackends) +
static_cast<uint8_t>(backend_k));
}
if (functionality_k == DispatchKey::Quantized) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfQuantizedBackends) +
static_cast<uint8_t>(backend_k));
}
if (functionality_k == DispatchKey::AutogradFunctionality) {
return static_cast<DispatchKey>(
static_cast<uint8_t>(DispatchKey::StartOfAutogradBackends) +
static_cast<uint8_t>(backend_k));
}
return DispatchKey::Undefined;
}
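An illustrative round trip (not part of the diff) through the three mappings above, using the SparseCUDA runtime key.

#include <c10/core/DispatchKey.h>

static_assert(c10::toFunctionalityKey(c10::DispatchKey::SparseCUDA) ==
                  c10::DispatchKey::Sparse,
              "SparseCUDA is a runtime instance of the Sparse functionality");
static_assert(c10::toBackendComponent(c10::DispatchKey::SparseCUDA) ==
                  c10::BackendComponent::CUDABit,
              "for the CUDA backend");
static_assert(c10::toRuntimePerBackendFunctionalityKey(
                  c10::DispatchKey::Sparse, c10::BackendComponent::CUDABit) ==
                  c10::DispatchKey::SparseCUDA,
              "and the (functionality, backend) pair maps back to SparseCUDA");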
} // namespace c10
namespace torch {

View File

@ -1,29 +1,37 @@
#include <c10/core/DispatchKeySet.h>
#include <c10/util/irange.h>
namespace c10 {
// backend_dispatch_keyset includes all dispatch keys that map to backends.
// backend_dispatch_keyset should include all runtime backend keys.
// Alias key DispatchKey::CompositeExplicitAutograd maps to
// backend_dispatch_keyset
constexpr DispatchKeySet backend_dispatch_keyset =
autogradother_backends | DispatchKeySet(DispatchKey::Dense);
// backend_dispatch_keyset NestedTensor has been explicitly removed due to
// incompatibility with some kernels, such as structured kernels, that use the
// DefaultBackend key.
constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends |
DispatchKeySet({
DispatchKey::CPU,
DispatchKey::CUDA,
DispatchKey::XLA,
DispatchKey::Lazy,
DispatchKey::XPU,
DispatchKey::PrivateUse1,
DispatchKey::PrivateUse2,
DispatchKey::PrivateUse3,
DispatchKey::MLC,
DispatchKey::HPU,
DispatchKey::ORT,
DispatchKey::Meta,
});
bool isBackendDispatchKey(DispatchKey t) {
return t != DispatchKey::Undefined
// See Note [No Alias Keys in DispatchKeySet]
&& !isAliasDispatchKey(t)
// Note [NestedTensor Not Included in Backend Keys]
// NestedTensor has been explicitly removed from the "backend keyset" due
// to incompatibility with some kernels, so we don't want it to be
// included in CompositeImplicitAutograd or CompositeExplicitAutograd
// kernels.
&& t != DispatchKey::NestedTensor && backend_dispatch_keyset.has(t);
&& !isAliasDispatchKey(t) && backend_dispatch_keyset.has(t);
}
// math_dispatch_keyset contains all keys in backend_dispatch_keyset and
// autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd
// maps to [math_dispatch_keyset x full_backend_mask]
// maps to math_dispatch_keyset.
constexpr DispatchKeySet math_dispatch_keyset =
backend_dispatch_keyset | autograd_dispatch_keyset;
@ -31,12 +39,7 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) {
TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);
switch (t) {
case DispatchKey::Autograd:
// See Note [autograd_dispatch_keyset Does Not Include Backend Bits]
// That's why we OR it with a mask of the backend bits here.
// getRuntimeDispatchKeySet() expects to return a keyset of runtime
// dispatch keys, like AutogradCPU, but that requires having backend bits.
return autograd_dispatch_keyset |
DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
return autograd_dispatch_keyset;
case DispatchKey::CompositeImplicitAutograd:
return math_dispatch_keyset;
case DispatchKey::CompositeExplicitAutograd:
@ -50,13 +53,11 @@ bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) {
TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined);
switch (t) {
case DispatchKey::Autograd:
return autograd_dispatch_keyset.has(toFunctionalityKey(k));
return autograd_dispatch_keyset.has(k);
case DispatchKey::CompositeImplicitAutograd:
// See Note [NestedTensor Not Included in Backend Keys]
return k != DispatchKey::NestedTensor && math_dispatch_keyset.has(k);
return math_dispatch_keyset.has(k);
case DispatchKey::CompositeExplicitAutograd:
// See Note [NestedTensor Not Included in Backend Keys]
return k != DispatchKey::NestedTensor && backend_dispatch_keyset.has(k);
return backend_dispatch_keyset.has(k);
default:
return t == k;
}
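An illustrative sketch (not part of the diff) of how the alias-key helpers above behave after this change; it assumes autograd_dispatch_keyset carries the AutogradFunctionality bit, as the comments imply.

#include <c10/core/DispatchKeySet.h>
#include <cassert>

void alias_key_example() {
  using namespace c10;
  // Expanding the Autograd alias now ORs in the backend bits, so the result
  // contains concrete runtime keys such as AutogradCPU.
  auto autograd_ks = getRuntimeDispatchKeySet(DispatchKey::Autograd);
  assert(autograd_ks.has(DispatchKey::AutogradCPU));
  // Membership can also be checked without materializing the whole set.
  assert(runtimeDispatchKeySetHas(DispatchKey::Autograd, DispatchKey::AutogradCUDA));
  // Only real runtime backend keys count as backend dispatch keys; aliases don't.
  assert(isBackendDispatchKey(DispatchKey::CPU));
  assert(!isBackendDispatchKey(DispatchKey::CompositeImplicitAutograd));
}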
@ -78,6 +79,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
return DispatchKeySet(DispatchKey::MLC);
case DispatchKey::AutogradHPU:
return DispatchKeySet(DispatchKey::HPU);
case DispatchKey::AutogradNestedTensor:
return DispatchKeySet(DispatchKey::NestedTensor);
case DispatchKey::AutogradXPU:
return DispatchKeySet(DispatchKey::XPU);
case DispatchKey::AutogradPrivateUse1:
@ -93,6 +96,23 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) {
}
}
DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t) {
switch (t) {
case DispatchKey::CPU:
return DispatchKeySet(DispatchKey::AutocastCPU);
case DispatchKey::CUDA:
case DispatchKey::XLA:
return DispatchKeySet(DispatchKey::AutocastCUDA);
default:
return DispatchKeySet();
}
}
DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t) {
return DispatchKeySet(
{DispatchKey::ADInplaceOrView, getAutogradKeyFromBackend(t)});
}
bool isIncludedInAlias(DispatchKey k, DispatchKey alias) {
return k != DispatchKey::Undefined && runtimeDispatchKeySetHas(alias, k);
}
@ -109,167 +129,18 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) {
return os;
}
os << "DispatchKeySet(";
DispatchKey tid;
bool first = true;
for (auto k : ts) {
while ((tid = ts.highestPriorityTypeId()) != DispatchKey::Undefined) {
if (!first) {
os << ", ";
}
os << k;
os << tid;
ts = ts.remove(tid);
first = false;
}
os << ")";
return os;
}
DispatchKeySet::iterator& DispatchKeySet::iterator::operator++() {
TORCH_INTERNAL_ASSERT(next_functionality_ >= num_backends);
TORCH_INTERNAL_ASSERT(next_functionality_ <= iterator::end_iter_mask_val);
TORCH_INTERNAL_ASSERT(next_backend_ <= num_backends);
// Create a masked version of the set representation to ignore previous
// keys that we've iterated through.
uint64_t masked_functionality_bits =
llvm::maskTrailingZeros<uint64_t>(next_functionality_) & *data_ptr_;
uint64_t masked_backend_bits =
llvm::maskTrailingZeros<uint64_t>(next_backend_) & full_backend_mask &
*data_ptr_;
uint64_t first_functionality_idx =
llvm::findFirstSet(masked_functionality_bits);
uint64_t first_backendcomponent_idx = llvm::findFirstSet(masked_backend_bits);
// If there are no keys, set to end iterator value
if (first_functionality_idx == std::numeric_limits<uint64_t>::max() ||
next_functionality_ == iterator::end_iter_mask_val) {
// Set up state to be the same as end()
next_functionality_ = iterator::end_iter_mask_val;
current_dispatchkey_idx_ = iterator::end_iter_key_val;
next_backend_ = 0;
current_backendcomponent_idx_ = iterator::end_iter_key_val;
return *this;
}
// The +1 is because of DispatchKey::Undefined and
// BackendComponent::InvalidBit
auto new_next_functionality = first_functionality_idx + 1;
auto new_backendcomponent_idx = first_backendcomponent_idx + 1;
// and the -num_backends is because the first <num_backends> bits in the
// keyset are not Dispatch Keys.
auto next_dispatchkey_idx = new_next_functionality - num_backends;
// If the current functionality bit is a per-backend bit, we need special
// handling
if (isPerBackendFunctionalityKey(
static_cast<DispatchKey>(next_dispatchkey_idx))) {
// case 1: if the current backend is undefined, then there is no valid
// backend instance of this functionality key so we can skip it.
if (first_backendcomponent_idx == std::numeric_limits<uint64_t>::max()) {
// increment the functionality mask so we skip the current functionality
// bit on the next increment.
next_functionality_ = new_next_functionality;
++(*this);
return *this;
}
// Otherwise, at this point we know what the current backend and
// functionality bits are.
current_dispatchkey_idx_ = next_dispatchkey_idx;
current_backendcomponent_idx_ = new_backendcomponent_idx;
// Next, we need to set up the masks for the next increment.
uint64_t next_backendcomponent_bits =
llvm::maskTrailingZeros<uint64_t>(first_backendcomponent_idx + 1) &
full_backend_mask & *data_ptr_;
uint64_t next_backendcomponent_idx =
llvm::findFirstSet(next_backendcomponent_bits);
if (next_backendcomponent_idx == std::numeric_limits<uint64_t>::max()) {
// case 2: the current backend is valid, but there is not another backend
// in the keyset. In this case, we need to bump the functionality mask and
// reset the backend mask for the next increment
next_functionality_ = new_next_functionality;
next_backend_ = 0;
} else {
// case 3: we have another backend to iterate over. We want to iterate
// over the same functionality bit next time, but a different backend bit.
next_backend_ = first_backendcomponent_idx + 1;
}
} else {
// Functionality bits that aren't per backend are simpler to handle. We can
// ignore the backend bits.
TORCH_INTERNAL_ASSERT(next_backend_ == 0);
current_dispatchkey_idx_ = next_dispatchkey_idx;
next_functionality_ = new_next_functionality;
}
return *this;
}
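A sketch (not part of the diff) of the observable behavior of the iterator above: building-block bits are expanded into concrete runtime keys, lowest bits first, per the trailing-zero masking in operator++.

#include <c10/core/DispatchKeySet.h>
#include <iostream>

void iterate_example() {
  using namespace c10;
  // One per-backend functionality bit plus two backend bits.
  DispatchKeySet ks = DispatchKeySet(DispatchKey::Dense) |
      DispatchKeySet(BackendComponent::CPUBit) |
      DispatchKeySet(BackendComponent::CUDABit);
  for (DispatchKey k : ks) {
    std::cout << k << " "; // expected to print something like "CPU CUDA "
  }
  std::cout << "\n";
}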
std::array<FunctionalityOffsetAndMask, num_functionality_keys>
initializeFunctionalityOffsetsAndMasks() {
std::array<FunctionalityOffsetAndMask, num_functionality_keys>
offsets_and_masks;
// manually set the first entry, which corresponds to Undefined.
offsets_and_masks[0] = FunctionalityOffsetAndMask(0, 0);
// loop through every functionality key (aside from Undefined).
for (const auto functionality_idx : c10::irange(1, num_functionality_keys)) {
// functionality_idx should be Dense -> 1, ...
auto prev_offset_and_mask = offsets_and_masks[functionality_idx - 1];
auto k = static_cast<DispatchKey>(functionality_idx);
#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS)
// [Note: Trimmed Mobile Dispatch Keys]
uint16_t mask = 0;
uint16_t offset = 0;
switch (k) {
case DispatchKey::Undefined:
offset = 0;
case DispatchKey::CPU:
offset = 1;
case DispatchKey::QuantizedCPU:
offset = 2;
case DispatchKey::SparseCPU:
offset = 3;
case DispatchKey::BackendSelect:
offset = 4;
case DispatchKey::ADInplaceOrView:
offset = 5;
case DispatchKey::AutogradOther:
offset = 6;
case DispatchKey::AutogradCPU:
offset = 7;
default:
// All other keys which are unsupported on mobile will get sent
// to the undefined kernel, causing them to error.
offset = 0;
}
offsets_and_masks[functionality_idx] =
FunctionalityOffsetAndMask(offset, 0);
}
#else
// If the previous functionality was not per-backend, then we can just
// increment the previous offset. Otherwise, the next offset =
// previous_offset + num_backends.
auto next_offset = prev_offset_and_mask.offset +
(prev_offset_and_mask.mask == 0 ? 1 : num_backends);
// the mask is used in the runtime index calculation to find the offset of
// the backend. For non-per-backend functionalities, this offset should
// always be 0. Otherwise, we need to get the index of the backend (which we
// can do using a backend mask).
auto next_mask = isPerBackendFunctionalityKey(k) ? full_backend_mask : 0;
offsets_and_masks[functionality_idx] =
FunctionalityOffsetAndMask(next_offset, next_mask);
}
// Sanity check that the computed offset index of the last functionality key
// is correct. This assumes that the highest priority functionality key is not
// per backend.
TORCH_INTERNAL_ASSERT(
offsets_and_masks[num_functionality_keys - 1].offset ==
(num_runtime_entries - 1),
"num_runtime_entries: ",
num_runtime_entries,
"last_offset: ",
offsets_and_masks[num_functionality_keys - 1].offset);
#endif
return offsets_and_masks;
}
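A worked example (illustrative only, non-mobile build) of the first two table entries, which follow directly from the loop above: Undefined is not per-backend and occupies slot 0, and Dense, the first real functionality, starts right after it with the backend mask set.

#include <c10/core/DispatchKeySet.h>
#include <cassert>
#include <cstdint>

void offsets_example() {
  using namespace c10;
  const auto& table = offsetsAndMasks();
  // Undefined: slot 0, no backend mask.
  assert(table[0].offset == 0 && table[0].mask == 0);
  // Dense (functionality index 1) is per-backend, so it masks the backend
  // bits to pick one of num_backends consecutive slots starting at offset 1.
  assert(table[static_cast<uint8_t>(DispatchKey::Dense)].offset == 1);
  assert(table[static_cast<uint8_t>(DispatchKey::Dense)].mask == full_backend_mask);
}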
} // namespace c10

View File

@ -1,4 +1,5 @@
#pragma once
#include <c10/core/DispatchKey.h>
#include <c10/util/Exception.h>
#include <c10/util/Metaprogramming.h>
@ -7,147 +8,29 @@
namespace c10 {
struct FunctionalityOffsetAndMask {
// empty constructor shouldn't be used; only needed to initialize
// the array before populating it.
FunctionalityOffsetAndMask() {}
FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask)
: offset(offset), mask(mask) {}
// This needs to be big enough to cover the size of the operator table.
uint16_t offset;
// See Note [No More Than 16 Backends]
// This mask needs to be big enough to mask all of the backend bits.
// We probably don't ever want to have more than 16 backend bits, so uint16_t
// should be enough.
uint16_t mask;
};
static_assert(
c10::num_runtime_entries < 65536,
"The dispatcher currently only supports up to 2^16 runtime entries");
C10_API std::array<FunctionalityOffsetAndMask, num_functionality_keys>
initializeFunctionalityOffsetsAndMasks();
C10_ALWAYS_INLINE static const std::
array<FunctionalityOffsetAndMask, num_functionality_keys>&
offsetsAndMasks() {
static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks();
return offsets_and_masks_;
}
// A representation of a set of DispatchKeys. A DispatchKeySet contains both
// "functionality" bits and "backend bits", and every tensor holds its own
// DispatchKeySet. The Dispatcher implements multiple dispatch by grabbing the
// keyset on every input tensor, oring them together, and dispatching to a
// specific piece of functionality. The functionality bits are *ordered*. When
// multiple functionality bits are set, we use the highest priority
// functionality. Similarly, multiple backend bits can theoretically be set if
// you call an operator with multiple tensors from different devices (e.g. CPU
// and CUDA), although support for mixed device dispatch is limited (the only
// kernels that gracefully handle mixed device inputs for now are cuda kernels
// that take in a scalar cpu tensor).
// A representation of a set of DispatchKeys. A tensor may have multiple
// tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the
// DispatchKeySet specifies what type ids apply. The internal representation is
// as a 64-bit bit set (this means only 64 tensor type ids are supported).
//
// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like
// "what is the highest priority DispatchKey in the set"? (The set itself is
// not ordered; two sets with the same ids will always have the ids ordered in
// the same way.)
// Note that DispatchKeys are ordered; thus, we can ask questions like "what is
// the highest priority DispatchKey in the set"? (The set itself is not
// ordered; two sets with the same ids will always have the ids ordered in the
// same way.)
//
// Note [DispatchKeySet Internal Representation]
// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects
// that get passed around at runtime.
// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset
// and individual dispatch keys.
// At the moment, there are no nontrivial uses of this set; tensors are always
// singletons. In the near future, this set will represent variable? + tensor
// type id. In the far future, it will be requires grad? + profiling? +
// tracing? + lazy? + tensor type id.
//
// First: why do we have this distinction, and why not map every dispatch key
// directly to a bit? This is mostly because we have several types of
// functionalities that different backends would like to customize. For example,
// we have:
// - "Dense": CPU, CUDA, XLA, ... (~12 keys)
// - "Sparse": SparseCPU, SparseCUDA, ...
// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ...
// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ...
// The problem is that the total number of keys grows quadratically with [#
// backends] x [# functionalities], making it very difficult to map each key
// directly to a bit in a bitset without dramatically increasing the size of the
// bitset over time.
// (The difference between variable and requires grad, is that
// there are currently three states a tensor can be:
// 1. Not a variable
// 2. Variable with requires_grad=False
// 3. Variable with requires_grad=True
// Eventually, we want to kill state (1), and only dispatch to autograd
// handling code if one of the inputs requires grad.)
//
// The two enums (BackendComponent and DispatchKey) can be divided roughly into
// 5 categories.
//
// (1) "Building block" keys
// (a) backends: jEverything in the BackendComponent enum (e.g. CPUBit,
// CUDABIt) (b) functionalities: (per-backend) functionality-bit DispatchKeys
// (e.g. AutogradFunctionality, Sparse, Dense)
// (2) "Runtime" keys
// (a) "non-customizable backends" (e.g. FPGA)
// (b) "non-customizable functionalities" (e.g. Functionalize)
// (c) "per-backend instances of customizable functionalities" (e.g. CPU,
// SparseCPU, AutogradCPU)
// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys])
//
// (1) Building block keys always correspond to individual bits in a
// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual
// runtime keys. e.g.
// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit,
// DispatchKey::Dense});
// // The keyset has the runtime dense-cpu key.
// dense_cpu_ks.has(DispatchKey::CPU);
// // And it contains the building block keys too.
// dense_cpu_ks.has(DispatchKey::CPUBit);
// dense_cpu_ks.has(DispatchKey::Dense);
//
// Not every backend and not every functionality counts as a "building block
// key". This is mostly to give us more levers to pull in the design space.
// Backend keys and functionality keys that count as "building blocks" will
// contribute to a full cross product of functionality that can be overridden.
//
// For example, right now we have at least 12 "backend" building blocks (CPU,
// CUDA, XLA, ...) and at least 4 "functionality" building blocks (Dense,
// Sparse, Quantized, AutogradFunctionality, ...). These keys together allow
// every dispatcher operator to be customized in up to 12*4 different ways. Each
// of those requires a slot in the operator table of every dispatcher operator.
// Not every piece of functionality necessarily needs to be customizable
// per-backend, and not every backend necessarily needs to be able to customize
// every type of functionality.
//
//
// (2) Every runtime key corresponds directly to a slot in an operator's runtime
// dispatch table, and you can directly register kernels to a runtime dispatch
// key.
//
// For per-backend functionalities like "Dense" or "AutogradFunctionality",
// you can think of the corresponding runtime dispatch keys as "instances" of
// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all
// runtime instances of the "Dense" building block key.
// (2a) and (2b) are represented identically in the DispatchKeySet logic:
// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT
// customizable per backend.
// In order to do so, we'd need to promote it to a per-backend functionality
// "building block" key.
// - non-customizable backends (e.g. FPGA) can NOT customize existing
// functionality like Sparse, Autograd, etc.
// In order to do so, we'd need to promote it to a backend "building block"
// key.
//
// In both cases, these keys directly correspond to runtime slots in the
// operator table.
//
//
// (3) "Alias" keys
// See Note [Alias Dispatch Keys]
//
// Final note: for anyone making future changes to the Dispatcher +
// DispatchKeySet internals, there's a closed PR with a basic
// python-implementation of the Dispatcher that might be useful in quickly
// testing out and validating changes. See it at
// https://github.com/pytorch/pytorch/pull/68743
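A compilable variant (illustrative only) of the inline example in the note above; it uses the two single-argument constructors rather than a mixed initializer list, since CPUBit lives in BackendComponent, not DispatchKey.

#include <c10/core/DispatchKeySet.h>
#include <cassert>

void building_block_example() {
  using namespace c10;
  // A runtime key such as CPU sets one functionality bit (Dense) and one
  // backend bit (CPUBit)...
  DispatchKeySet cpu_ks(DispatchKey::CPU);
  // ...so the same runtime key is visible in a set assembled from its
  // building blocks.
  DispatchKeySet assembled =
      DispatchKeySet(DispatchKey::Dense) | DispatchKeySet(BackendComponent::CPUBit);
  assert(cpu_ks.has(DispatchKey::CPU));
  assert(assembled.has(DispatchKey::CPU));
}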
// An undefined tensor is one with an empty tensor type set.
class DispatchKeySet final {
public:
@ -158,146 +41,29 @@ class DispatchKeySet final {
// NB: default constructor representation as zero is MANDATORY as
// use of DispatchKeySet in TLS requires this.
constexpr DispatchKeySet() : repr_(0) {}
constexpr DispatchKeySet(Full)
: repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {}
: repr_(std::numeric_limits<decltype(repr_)>::max()) {}
constexpr DispatchKeySet(FullAfter, DispatchKey t)
// LSB after t are OK, but not t itself.
// "functionalities" have a notion of ordering (e.g. Autograd > Sparse >
// Quantized > Dense). But backends don't really have an ordering.
// Therefore, we're enforcing that FullAfter can only be used on
// "functionality" keys.
: repr_(
(1ULL
<< (num_backends + static_cast<uint8_t>(toFunctionalityKey(t)) -
1)) -
1) {}
: repr_((1ULL << (static_cast<uint8_t>(t) - 1)) - 1) {}
// Public version of DispatchKeySet(uint64_t) API; external users
// must be explicit when they do this!
constexpr DispatchKeySet(Raw, uint64_t x) : repr_(x) {}
constexpr explicit DispatchKeySet(BackendComponent k) {
if (k == BackendComponent::InvalidBit) {
repr_ = 0;
} else {
repr_ = 1ULL << (static_cast<uint8_t>(k) - 1);
}
}
constexpr explicit DispatchKeySet(DispatchKey k) {
if (k == DispatchKey::Undefined) {
// Case 1: handle Undefined specifically
repr_ = 0;
} else if (k <= DispatchKey::EndOfFunctionalityKeys) {
// Case 2: handle "functionality-only" keys
// These keys have a functionality bit set, but no backend bits
// These can technically be either:
// - valid runtime keys (e.g. DispatchKey::AutogradOther,
// DispatchKey::FuncTorchBatched, etc)
// - "building block" keys that aren't actual runtime keys (e.g.
// DispatchKey::Dense or Sparse)
uint64_t functionality_val = 1ULL
<< (num_backends + static_cast<uint8_t>(k) - 1);
repr_ = functionality_val;
} else if (k <= DispatchKey::EndOfRuntimeBackendKeys) {
// Case 3: "runtime" keys that have a functionality bit AND a backend bit.
// First compute which bit to flip for the functionality.
auto functionality_k = toFunctionalityKey(k);
// The - 1 is because Undefined is technically a "functionality" that
// doesn't show up in the bitset. So e.g. Dense is technically the second
// functionality, but the lowest functionality bit.
uint64_t functionality_val = 1ULL
<< (num_backends + static_cast<uint8_t>(functionality_k) - 1);
// then compute which bit to flip for the backend
// Case 4a: handle the runtime instances of "per-backend functionality"
// keys. For example, given DispatchKey::CPU, we should set:
// - the Dense functionality bit
// - the CPUBit backend bit
// first compute which bit to flip for the backend
auto backend_k = toBackendComponent(k);
uint64_t backend_val = backend_k == BackendComponent::InvalidBit
explicit constexpr DispatchKeySet(DispatchKey t)
: repr_(
t == DispatchKey::Undefined
? 0
: 1ULL << (static_cast<uint8_t>(backend_k) - 1);
repr_ = functionality_val + backend_val;
} else {
// At this point, we should have covered every case except for alias keys.
// Technically it would be possible to add alias dispatch keys to a
// DispatchKeySet, but the semantics are a little confusing and this
// currently isn't needed anywhere.
repr_ = 0;
}
}
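// For example (illustrative): DispatchKeySet(DispatchKey::AutogradCUDA) sets
// two bits (the AutogradFunctionality functionality bit and the CUDABit
// backend bit), while DispatchKeySet(DispatchKey::FuncTorchBatched) sets only
// its own functionality bit, since that key is not per-backend.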
constexpr uint64_t keys_to_repr(std::initializer_list<DispatchKey> ks) {
uint64_t repr = 0;
for (auto k : ks) {
repr |= DispatchKeySet(k).repr_;
}
return repr;
}
constexpr uint64_t backend_bits_to_repr(
std::initializer_list<BackendComponent> ks) {
uint64_t repr = 0;
for (auto k : ks) {
repr |= DispatchKeySet(k).repr_;
}
return repr;
}
: 1ULL << (static_cast<uint8_t>(t) - 1)) {}
explicit constexpr DispatchKeySet(std::initializer_list<DispatchKey> ks)
: repr_(keys_to_repr(ks)) {}
explicit constexpr DispatchKeySet(std::initializer_list<BackendComponent> ks)
// Note: for some reason, putting this logic directly in the constructor
// appears to fail to compile on CUDA 10.1.
// See an example internal failure at
// https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr
: repr_(backend_bits_to_repr(ks)) {}
: repr_(0) {
for (auto k : ks) {
repr_ |= DispatchKeySet(k).repr_;
}
}
// Test if a DispatchKey is in the set
inline bool has(DispatchKey t) const {
bool inline has(DispatchKey t) const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined);
return has_all(DispatchKeySet(t));
}
constexpr bool has_backend(BackendComponent t) const {
return has_all(DispatchKeySet(t));
}
// Test if a DispatchKey is in the set
// Given a DispatchKeySet of functionality keys and (potentially) backend
// keys, tests if all of them are in the current set.
constexpr bool has_all(DispatchKeySet ks) const {
return static_cast<bool>((repr_ & ks.repr_) == ks.repr_);
}
// Given a DispatchKeySet of functionality keys and (potentially) backend
// keys, tests if any of them are in the current set. This could technically
// be pretty easily implemented using has(). It is strictly a perf
// optimization though. There are many places in the code base where we want
// to test for multiple functionality keys together. HOWEVER, runtime
// per-backend functionality keys aren't allowed to be used with this
// function, because you can end up with weird results. e.g.
// DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU))
// would return true.
inline bool has_any(DispatchKeySet ks) const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
// Either there are no backend bits in the input keyset
((ks.repr_ & full_backend_mask) == 0) ||
// or there are no per-backend-functionality bits
// See [Note: Per-Backend Functionality Dispatch Keys]
((ks &
DispatchKeySet({
DispatchKey::Dense,
DispatchKey::Quantized,
DispatchKey::Sparse,
DispatchKey::AutogradFunctionality,
})
.repr_) == 0));
return static_cast<bool>((repr_ & ks.repr_) != 0);
return static_cast<bool>(repr_ & DispatchKeySet(t).repr_);
}
// Test if DispatchKeySet is a superset of ks.
bool isSupersetOf(DispatchKeySet ks) const {
@ -308,64 +74,31 @@ class DispatchKeySet final {
return DispatchKeySet(repr_ | other.repr_);
}
// Perform set intersection
constexpr DispatchKeySet operator&(DispatchKeySet other) const {
DispatchKeySet operator&(DispatchKeySet other) const {
return DispatchKeySet(repr_ & other.repr_);
}
// Compute the set difference self - other,
// but ONLY for the functionality keys.
// Any backend bits set on self will remain unchanged.
// See Note [Removing keys from DispatchKeySet Only Affects Functionality
// Keys]
// Compute the set difference self - other
DispatchKeySet operator-(DispatchKeySet other) const {
return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_));
return DispatchKeySet(repr_ & ~other.repr_);
}
// Compute self ^ other
constexpr DispatchKeySet operator^(DispatchKeySet other) const {
return DispatchKeySet(repr_ ^ other.repr_);
}
// Perform set equality
bool operator==(DispatchKeySet other) const {
return repr_ == other.repr_;
}
bool operator!=(DispatchKeySet other) const {
return repr_ != other.repr_;
}
// Add a DispatchKey to the DispatchKey set. Does NOT mutate,
// returns the extended DispatchKeySet!
C10_NODISCARD DispatchKeySet add(DispatchKey t) const {
return *this | DispatchKeySet(t);
}
C10_NODISCARD DispatchKeySet add(DispatchKeySet ks) const {
return *this | ks;
}
// Remove a DispatchKey from the DispatchKey set.
// This is generally not an operation you should be doing
// (it's used to implement the printing overload, operator<<)
//
// Note [Removing keys from DispatchKeySet Only Affects Functionality Keys]
// For now, only "functionality bits" are allowed to be removed from the
// keyset; this is specifically needed by the fallthrough key calculation
// logic. Why is removing backend bits problematic? Consider this example:
//
// DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA,
// DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA)
// DispatchKeySet([DispatchKey.CPU,
// DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA)
//
// What do we want to happen?
// Technically, we'd like it to be true that after removal,
// the first keyset still has the CUDA dispatch key while the second doesn't.
// Unfortunately there's no way to represent that, because the two keysets are
// represented the same way internally:
//   functionality bits: Autograd, Dense
//   backend bits: CPU, CUDA
//
// Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd"
// bit from the bitset.
constexpr DispatchKeySet remove(DispatchKey t) const {
return DispatchKeySet(
repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask));
// Remove a DispatchKey from the DispatchKey set. This is
// generally not an operation you should be doing (it's
// used to implement operator<<)
C10_NODISCARD constexpr DispatchKeySet remove(DispatchKey t) const {
return DispatchKeySet(repr_ & ~DispatchKeySet(t).repr_);
}
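// A minimal sketch of the note above (keys assumed from DispatchKey.h):
//   auto ks = DispatchKeySet({DispatchKey::CPU, DispatchKey::AutogradCPU});
//   auto out = ks.remove(DispatchKey::AutogradCPU);
//   out.has(DispatchKey::CPU);                 // true: Dense + CPU bits kept
//   out.has(DispatchKey::AutogradCPU);         // false: Autograd bit cleared
//   out.has_backend(BackendComponent::CPUBit); // true: backend bits survive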
// Is the set empty? (AKA undefined tensor)
bool empty() const {
@ -374,78 +107,22 @@ class DispatchKeySet final {
uint64_t raw_repr() {
return repr_;
}
DispatchKey highestFunctionalityKey() const {
auto functionality_idx = indexOfHighestBit();
// This means that none of the functionality bits were set.
if (functionality_idx < num_backends)
return DispatchKey::Undefined;
// The first num_backend bits in the keyset don't correspond to real
// dispatch keys.
return static_cast<DispatchKey>(functionality_idx - num_backends);
}
// This is similar to toBackendComponent(DispatchKey), but less restrictive.
// toBackendComponent() errors out if the key that it was passed has no
// backend bits, which is useful for error checking. We need a version of that
// here that can also handle "fake" backends like FPGA, because they need to
// map to the AutogradOther key. For those backends, we return
// BackendComponent::InvalidBit.
BackendComponent highestBackendKey() const {
// mask to mask out functionality bits
auto backend_idx =
DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit();
// all zeros across the backend bits means that no backend bits are set.
if (backend_idx == 0)
return BackendComponent::InvalidBit;
return static_cast<BackendComponent>(backend_idx);
}
// returns the DispatchKey of highest priority in the set.
// Return the type id in this set with the highest priority (i.e.,
// is the largest in the DispatchKey enum). Intuitively, this
// type id is the one that should handle dispatch (assuming there
// aren't any further exclusions or inclusions).
DispatchKey highestPriorityTypeId() const {
auto functionality_k = highestFunctionalityKey();
if (isPerBackendFunctionalityKey(functionality_k)) {
return toRuntimePerBackendFunctionalityKey(
functionality_k, highestBackendKey());
}
return functionality_k;
// TODO: If I put Undefined as entry 64 and then adjust the
// singleton constructor to shift from the right, we can get rid of the
// subtraction here. It's modestly more complicated to get right so I
// didn't do it for now.
return static_cast<DispatchKey>(64 - llvm::countLeadingZeros(repr_));
}
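// For example (illustrative): for a keyset built from
// {DispatchKey::CUDA, DispatchKey::AutogradCUDA}, the highest functionality
// bit is AutogradFunctionality and the highest backend bit is CUDABit, so
// highestPriorityTypeId() returns DispatchKey::AutogradCUDA.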
// Returns the index of the most-significant bit in the keyset.
// This is used as part of the calculation into the operator table to get:
// - the highest "functionality" bit in the keyset.
// - the highest "backend" bit in the keyset.
uint8_t indexOfHighestBit() const {
return 64 - llvm::countLeadingZeros(repr_);
}
// returns the index in the operator table of the highest priority key in the
// keyset. Note that we could in theory implement this using
// highestPriorityTypeId(), but this code is on a very hot path and we can do it
// faster without it.
uint64_t getDispatchTableIndexForDispatchKeySet() const {
auto functionality_idx =
DispatchKeySet(repr_ >> num_backends).indexOfHighestBit();
auto offset_and_mask = offsetsAndMasks()[functionality_idx];
// Mask the functionality bits out first, then right-shift by 1.
// right-shifting by 1 because everything is zero-indexed.
// E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should
// give us an offset of 1, etc.
auto backend_idx =
DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit();
return offset_and_mask.offset + backend_idx;
}
// returns the "index" of the highest priority backend in the keyset.
// This is pretty similar to getBackendKey(), but:
// - It's hotpath code (part of the runtime bitset calculation)
// - It returns an integer index, not an enum value
// - Everything is shifted to the right by 1.
// BackendComponent::InvalidBit is technically the lowest enum value,
// but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2,
// etc.
uint64_t getBackendIndex() const {
return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit();
DispatchKey highestPriorityBackendTypeId() const {
return (*this &
((1ULL << static_cast<uint8_t>(DispatchKey::EndOfBackendKeys)) - 1))
.highestPriorityTypeId();
}
private:
@ -453,47 +130,42 @@ class DispatchKeySet final {
uint64_t repr_ = 0;
public:
// STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys
// in the set. The iterator is only invalidated by the destruction of the
// underlying DispatchKeySet as the iterator stores a pointer to the raw
// representation of the DispatchKeySet. Note: When we encounter a per-backend
// functionality (e.g. Dense or Sparse), we will iterate through EVERY backend
// in the keyset, for that functionality. For example, if the next
// functionality key to iterate over is Autograd, and the backend bits in the
// keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit],
// then the next two keys we return will be DispatchKey::AutogradCPU,
// DispatchKey::AutogradCUDA (CPU first because it has lower precedence than
// CUDA in DispatchKey.h).
// STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the
// set. The iterator is only invalidated by the destruction of the underlying
// DispatchKeySet as the iterator stores a pointer to the raw representation
// of the DispatchKeySet.
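// A short sketch of that iteration order (keys assumed from DispatchKey.h):
//   DispatchKeySet ks({DispatchKey::CPU, DispatchKey::AutogradCUDA});
//   // A range-for visits: CPU, CUDA, AutogradCPU, AutogradCUDA
//   for (DispatchKey k : ks) { /* ... */ }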
class iterator {
public:
using self_type = iterator;
using iterator_category = std::input_iterator_tag;
using value_type = DispatchKey;
using difference_type = ptrdiff_t;
// final mask value should mask out the entire keyset
static const uint8_t end_iter_mask_val =
num_backends + num_functionality_keys;
// final key value should be the last DispatchKey
static const uint8_t end_iter_key_val = num_functionality_keys;
// current_dispatchkey_idx_ will iterate through all functionality bits.
// current_backendcomponent_idx_ will iterate through all backend bits.
explicit iterator(
const uint64_t* data_ptr,
uint8_t next_functionality = num_backends,
uint8_t next_backend = 0)
: data_ptr_(data_ptr),
next_functionality_(next_functionality),
next_backend_(next_backend),
// These are in an invalid state at construction time, and set by the
// first increment call
current_dispatchkey_idx_(end_iter_key_val),
current_backendcomponent_idx_(end_iter_key_val) {
explicit iterator(const uint64_t* data_ptr, uint8_t i = 0)
: data_ptr_(data_ptr), i_(i) {
// Go to the first key in the set
++(*this);
}
C10_API self_type& operator++();
self_type& operator++() {
TORCH_INTERNAL_ASSERT(
i_ <= static_cast<uint8_t>(DispatchKey::NumDispatchKeys));
// Create a masked version of the set representation to ignore previous
// keys that we've iterated through.
uint64_t masked_data = llvm::maskTrailingZeros<uint64_t>(i_) & *data_ptr_;
uint64_t firstKeyIndex = llvm::findFirstSet(masked_data);
// If there are no keys, set to end iterator value
if (firstKeyIndex == std::numeric_limits<uint64_t>::max() ||
i_ == static_cast<uint8_t>(DispatchKey::NumDispatchKeys)) {
i_ = static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
return *this;
}
i_ = static_cast<uint8_t>(firstKeyIndex) + 1;
return *this;
}
self_type operator++(int) {
self_type previous_iterator = *this;
@ -502,50 +174,18 @@ class DispatchKeySet final {
}
bool operator==(const self_type& rhs) const {
return next_functionality_ == rhs.next_functionality_ &&
current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ &&
next_backend_ == rhs.next_backend_ &&
current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_;
return i_ == rhs.i_;
}
bool operator!=(const self_type& rhs) const {
return next_functionality_ != rhs.next_functionality_ ||
current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ ||
next_backend_ != rhs.next_backend_ ||
current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_;
return i_ != rhs.i_;
}
DispatchKey operator*() const {
auto functionality_key =
static_cast<DispatchKey>(current_dispatchkey_idx_);
if (isPerBackendFunctionalityKey(functionality_key)) {
auto next_key = toRuntimePerBackendFunctionalityKey(
functionality_key,
static_cast<BackendComponent>(current_backendcomponent_idx_));
// We expect all of the Dense, Sparse, Quantized, and Autograd keys to
// be ordered the same way with respect to their backends
TORCH_INTERNAL_ASSERT(
toBackendComponent(next_key) ==
static_cast<BackendComponent>(current_backendcomponent_idx_),
"Tried to map functionality key ",
toString(functionality_key),
" and backend bit ",
toString(
static_cast<BackendComponent>(current_backendcomponent_idx_)),
" to a runtime key, but ended up with ",
toString(next_key),
". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.",
" Please double check that enum for inconsistencies.");
return next_key;
} else {
return functionality_key;
}
return static_cast<DispatchKey>(i_);
}
private:
const uint64_t* data_ptr_;
uint8_t next_functionality_;
uint8_t next_backend_;
uint8_t current_dispatchkey_idx_;
uint8_t current_backendcomponent_idx_;
uint8_t i_;
};
public:
@ -555,35 +195,31 @@ class DispatchKeySet final {
return iterator(&repr_);
}
// We do not need to iterate beyond EndOfFunctionalityKeys so we will treat
// this as the end iterator.
// We do not need to iterate beyond NumDispatchKeys so we will treat this as
// the end iterator. NumDispatchKeys will always be strictly less than 64.
iterator end() const {
return iterator(&repr_, iterator::end_iter_mask_val);
return iterator(&repr_, static_cast<uint8_t>(DispatchKey::NumDispatchKeys));
}
};
C10_API std::string toString(DispatchKeySet);
C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet);
C10_API inline uint64_t getDispatchTableIndexForDispatchKey(DispatchKey k) {
return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet();
}
// Alias key DispatchKey::Autograd maps to
// (autograd_dispatch_keyset x full_backend_mask)
// autograd_dispatch_keyset should include all runtime autograd keys.
// Alias key DispatchKey::Autograd maps to autograd_dispatch_keyset.
// NB: keys in this set also get associated with CompositeImplicitAutograd
//
// Note [autograd_dispatch_keyset Does Not Include Backend Bits]
// We don't want to include any backend bits (BackendComponent::CPUBit, etc)
// directly in autograd_dispatch_keyset.
// Why? keysets like autograd_dispatch_keyset are commonly used to remove
// autograd keys from a DispatchKeySet throughout the code base. However, you
// are only allowed to remove functionality bits from a keyset, not backend
// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality
// Keys] for details. To be consistent and avoid confusion, we're explicitly
// setting up autograd_dispatch_keyset to not have any backend bits.
constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({
DispatchKey::AutogradFunctionality,
DispatchKey::AutogradCPU,
DispatchKey::AutogradCUDA,
DispatchKey::AutogradXLA,
DispatchKey::AutogradLazy,
DispatchKey::AutogradNestedTensor,
DispatchKey::AutogradMLC,
DispatchKey::AutogradHPU,
DispatchKey::AutogradXPU,
DispatchKey::AutogradPrivateUse1,
DispatchKey::AutogradPrivateUse2,
DispatchKey::AutogradPrivateUse3,
DispatchKey::AutogradOther,
});
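// A minimal sketch of the note above (names taken from this header):
//   auto ks = DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCUDA});
//   auto no_autograd = ks - autograd_dispatch_keyset;
//   no_autograd.has(DispatchKey::CUDA);         // true: backend bits untouched
//   no_autograd.has(DispatchKey::AutogradCUDA); // false: Autograd bit removed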
@ -606,39 +242,27 @@ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({
constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView =
autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView);
constexpr DispatchKeySet python_ks = DispatchKeySet(DispatchKey::Python);
constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse);
constexpr DispatchKeySet sparse_csr_ks =
DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA});
constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU);
// backend dispatch keys that map to DispatchKey::AutogradOther
// NB: keys in this set also get associated with CompositeImplicitAutograd
constexpr DispatchKeySet autogradother_backends =
DispatchKeySet(
// HIP and VE aren't in this list: they now have their own backend bits
// which means that they can now have their own Autograd keys.
// Technically, HIP will now redispatch to its own custom AutogradHIP
// slot in the runtime table.
{DispatchKey::FPGA,
constexpr DispatchKeySet autogradother_backends = DispatchKeySet(
{DispatchKey::HIP,
DispatchKey::VE,
DispatchKey::FPGA,
DispatchKey::ORT,
DispatchKey::Vulkan,
DispatchKey::Metal,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
DispatchKey::QuantizedCPU,
DispatchKey::QuantizedCUDA,
DispatchKey::CustomRNGKeyId,
DispatchKey::MkldnnCPU,
DispatchKey::Meta,
// Sparse and Quantized backends also live here.
DispatchKey::Sparse,
DispatchKey::Quantized})
// Including the backend bits because this keyset is used during op
// registration, which requires looping over all runtime autogradother
// backend keys.
| DispatchKeySet(DispatchKeySet::RAW, full_backend_mask);
DispatchKey::SparseCPU,
DispatchKey::SparseCUDA,
DispatchKey::SparseHIP,
DispatchKey::SparseVE,
DispatchKey::SparseXPU,
DispatchKey::SparseCsrCPU,
DispatchKey::SparseCsrCUDA,
DispatchKey::Meta});
// The set of dispatch keys that come after autograd
// n.b. this relies on the fact that AutogradOther is currently the lowest
@ -668,36 +292,6 @@ constexpr DispatchKeySet after_func_keyset =
// away with it by explicitly removing the key here.
c10::DispatchKey::ADInplaceOrView);
constexpr DispatchKeySet backend_bitset_mask =
DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1);
constexpr auto inplace_or_view_ks =
DispatchKeySet(DispatchKey::ADInplaceOrView);
constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU);
constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU);
constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA);
constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA);
constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy);
constexpr auto autograd_mlc_ks = DispatchKeySet(DispatchKey::AutogradMLC);
constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU);
constexpr auto autograd_privateuse1_ks =
DispatchKeySet(DispatchKey::AutogradPrivateUse1);
constexpr auto autograd_privateuse2_ks =
DispatchKeySet(DispatchKey::AutogradPrivateUse2);
constexpr auto autograd_privateuse3_ks =
DispatchKeySet(DispatchKey::AutogradPrivateUse3);
constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther);
struct OpTableOffsetAndMask {
uint16_t offset;
uint16_t backend_mask;
};
static_assert(
num_backends <= 16,
"Right now we expect the number of backends not to exceed 16. In the (unlikely) event"
" that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too.");
// true if t is a backend dispatch key
C10_API bool isBackendDispatchKey(DispatchKey t);
@ -713,53 +307,10 @@ C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k);
C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t);
// Returns a DispatchKeySet of autograd related keys mapped to backend.
// for a given backend key, use the associated autograd key.
// for non-backend keys, use AutogradOther as a default.
// Note: it's convenient and fast to return a default here rather than (say)
// returning an optional<DispatchKey>, or throwing. But it makes callers
// responsible for either a) enforcing the invariant that only backend keys
// be passed as arguments, or b) interpreting our return value carefully.
inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) {
switch (t) {
case BackendComponent::CPUBit:
return inplace_or_view_ks | autograd_cpu_ks;
case BackendComponent::XPUBit:
return inplace_or_view_ks | autograd_xpu_ks;
case BackendComponent::CUDABit:
return inplace_or_view_ks | autograd_cuda_ks;
case BackendComponent::XLABit:
return inplace_or_view_ks | autograd_xla_ks;
case BackendComponent::LazyBit:
return inplace_or_view_ks | autograd_lazy_ks;
case BackendComponent::MLCBit:
return inplace_or_view_ks | autograd_mlc_ks;
case BackendComponent::HPUBit:
return inplace_or_view_ks | autograd_hpu_ks;
case BackendComponent::PrivateUse1Bit:
return inplace_or_view_ks | autograd_privateuse1_ks;
case BackendComponent::PrivateUse2Bit:
return inplace_or_view_ks | autograd_privateuse2_ks;
case BackendComponent::PrivateUse3Bit:
return inplace_or_view_ks | autograd_privateuse3_ks;
default:
return inplace_or_view_ks | autograd_other_ks;
}
}
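// A minimal sketch of extending a backend keyset with its autograd keys
// (illustrative only):
//   auto ks = DispatchKeySet(DispatchKey::CUDA);
//   ks = ks | getAutogradRelatedKeySetFromBackend(BackendComponent::CUDABit);
//   // ks now also contains DispatchKey::ADInplaceOrView and
//   // DispatchKey::AutogradCUDA.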
C10_API DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t);
// Returns a DispatchKeySet of autocast related keys mapped to backend.
inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) {
constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU);
constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA);
switch (t) {
case BackendComponent::CPUBit:
return autocast_cpu_ks;
case BackendComponent::CUDABit:
case BackendComponent::XLABit:
return autocast_cuda_ks;
default:
return DispatchKeySet();
}
}
C10_API DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t);
// This API exists because we have a use case for checking
// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined)

View File

@ -120,11 +120,11 @@ TensorImpl::TensorImpl(
// [Note: Python key removal]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// In most constructors for TensorImpl, you will see Python key is removed from
// the passed in DispatchKeySet. Why?
// In most constructors for TensorImpl, you will see Python and PythonTLSSnapshot
// keys are removed from the passed in DispatchKeySet. Why?
//
// INVARIANT: Python dispatch key is set iff PyObject for the Tensor has a
// nontrivial __torch_dispatch__ implementation.
// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject for
// the Tensor has a nontrivial __torch_dispatch__ implementation.
//
// When a fresh TensorImpl is created, there is *no* PyObject (this only gets
// initialized lazily at the first point in time the Tensor passes into Python).
@ -132,8 +132,8 @@ TensorImpl::TensorImpl(
//
// In practice, what will happen shortly afterwards is that the TensorImpl
// will get its PyObject initialized by Tensor._make_subclass; at this point
// the Python dispatch key will be set and all is well. The point is to delay
// the dispatch key setting until that point.
// the Python and PythonTLSSnapshot dispatch keys will be set and all is well.
// The point is to delay the dispatch key setting until that point.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
TensorImpl::TensorImpl(
@ -148,7 +148,9 @@ TensorImpl::TensorImpl(
numel_(0),
data_type_(data_type),
device_opt_(storage_.device()),
key_set_(key_set - c10::python_ks) { // See [Note: Python key removal]
key_set_(key_set.remove(
DispatchKey::Python).remove(
DispatchKey::PythonTLSSnapshot)) { // See [Note: Python key removal]
init_bitfields();
// Inference tensor doesn't have version counter.
if (!is_inference()) {
@ -189,12 +191,12 @@ TensorImpl::TensorImpl(
// TODO: be more explicit about the full key set at call sites so we
// don't have to keep recomputing it here
auto k = key_set.highestBackendKey();
DispatchKey k = key_set.highestPriorityBackendTypeId();
key_set = key_set | getAutocastRelatedKeySetFromBackend(k);
// See [Note: Python key removal]
key_set = key_set - c10::python_ks;
key_set =
key_set.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); // See [Note: Python key removal]
// Inference tensor doesn't have autograd related keys.
if (inference_mode) {
@ -552,7 +554,7 @@ void TensorImpl::copy_tensor_metadata_except_version_counter(
dest_impl->storage_offset_ = src_impl->storage_offset_;
dest_impl->data_type_ = src_impl->data_type_;
dest_impl->device_opt_ = src_impl->device_opt_;
dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python);
dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot);
dest_impl->is_contiguous_ = src_impl->is_contiguous_;
dest_impl->has_contiguity_ = src_impl->has_contiguity_;
dest_impl->is_channels_last_contiguous_ =

View File

@ -838,103 +838,103 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
bool is_sparse() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
return key_set_.has_all(c10::sparse_ks);
return key_set_.has(DispatchKey::SparseCPU) ||
key_set_.has(DispatchKey::SparseCUDA) ||
key_set_.has(DispatchKey::SparseHIP) ||
key_set_.has(DispatchKey::SparseXPU);
}
// Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR
// format.
bool is_sparse_csr() const {
return key_set_.has_any(c10::sparse_csr_ks);
return key_set_.has(DispatchKey::SparseCsrCPU) ||
key_set_.has(DispatchKey::SparseCsrCUDA);
}
bool is_quantized() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized);
return key_set_.has_all(quantized_ks);
return key_set_.has(DispatchKey::QuantizedCPU) ||
key_set_.has(DispatchKey::QuantizedCUDA) ||
key_set_.has(DispatchKey::QuantizedXPU);
}
bool is_meta() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto meta_ks = DispatchKeySet(DispatchKey::Meta);
return key_set_.has_all(meta_ks);
return key_set_.has(DispatchKey::Meta);
}
bool is_cpu() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto cpu_bits_ks = DispatchKeySet(BackendComponent::CPUBit) |
DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::MkldnnCPU});
return key_set_.has_any(cpu_bits_ks);
return key_set_.has(DispatchKey::CPU) ||
key_set_.has(DispatchKey::SparseCPU) ||
key_set_.has(DispatchKey::SparseCsrCPU) ||
key_set_.has(DispatchKey::QuantizedCPU) ||
key_set_.has(DispatchKey::MkldnnCPU);
}
bool is_cuda() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto cuda_bits_ks = DispatchKeySet(BackendComponent::CUDABit) |
DispatchKeySet(DispatchKey::SparseCsrCUDA);
return key_set_.has_any(cuda_bits_ks);
return key_set_.has(DispatchKey::CUDA) ||
key_set_.has(DispatchKey::SparseCUDA) ||
key_set_.has(DispatchKey::SparseCsrCUDA) ||
key_set_.has(DispatchKey::QuantizedCUDA);
}
bool is_xpu() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto xpu_ks = DispatchKeySet(BackendComponent::XPUBit);
return key_set_.has_all(xpu_ks);
return key_set_.has(DispatchKey::XPU) ||
key_set_.has(DispatchKey::SparseXPU) ||
key_set_.has(DispatchKey::QuantizedXPU);
}
bool is_xla() const {
constexpr auto xla_ks = DispatchKeySet(BackendComponent::XLABit);
return key_set_.has_all(xla_ks);
return key_set_.has(DispatchKey::XLA);
}
bool is_hpu() const {
constexpr auto hpu_ks = DispatchKeySet(BackendComponent::HPUBit);
return key_set_.has_all(hpu_ks);
return key_set_.has(DispatchKey::HPU);
}
bool is_lazy() const {
constexpr auto lazy_ks = DispatchKeySet(BackendComponent::LazyBit);
return key_set_.has_all(lazy_ks);
return key_set_.has(DispatchKey::Lazy);
}
bool is_hip() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto hip_ks = DispatchKeySet(BackendComponent::HIPBit);
return key_set_.has_all(hip_ks);
return key_set_.has(DispatchKey::HIP) ||
key_set_.has(DispatchKey::SparseHIP);
}
bool is_ve() const {
// NB: This method is not virtual and avoids dispatches for performance
// reasons.
constexpr auto ve_ks = DispatchKeySet(BackendComponent::VEBit);
return key_set_.has_all(ve_ks);
return key_set_.has(DispatchKey::VE) || key_set_.has(DispatchKey::SparseVE);
}
bool is_mkldnn() const {
return key_set_.has_all(c10::mkldnn_ks);
return key_set_.has(DispatchKey::MkldnnCPU);
}
bool is_vulkan() const {
constexpr auto vulkan_ks = DispatchKeySet(DispatchKey::Vulkan);
return key_set_.has_all(vulkan_ks);
return key_set_.has(DispatchKey::Vulkan);
}
bool is_metal() const {
constexpr auto metal_ks = DispatchKeySet(DispatchKey::Metal);
return key_set_.has_all(metal_ks);
return key_set_.has(DispatchKey::Metal);
}
bool is_mlc() const {
constexpr auto mls_ks = DispatchKeySet(DispatchKey::MLC);
return key_set_.has_all(mls_ks);
return key_set_.has(DispatchKey::MLC);
}
bool is_ort() const {
constexpr auto ort_ks = DispatchKeySet(DispatchKey::ORT);
return key_set_.has_all(ort_ks);
return key_set_.has(DispatchKey::ORT);
}
// TODO: remove this once we don't automatically enable Autograd dispatch
@ -950,8 +950,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// Invariant:
// Inference tensor has version_counter_.enabled() == false
bool is_inference() {
bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks);
bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset);
bool no_ADInplaceOrView = !key_set_.has(c10::DispatchKey::ADInplaceOrView);
bool no_Autograd = (key_set_ & c10::autograd_dispatch_keyset).empty();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
no_ADInplaceOrView == no_Autograd,
"ADInplaceOrView and Autograd keys must be on/off at the same time.");
@ -972,22 +972,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
Layout layout() const {
// NB: This method is not virtual and avoids dispatches for perf.
// strided is also the most common layout type, so we check for
// strided case first.
// This keyset must also be kept in sync with the logic in
// is_sparse() / is_sparse_csr() / is_mkldnn()
constexpr auto sparse_and_sparsecsr_and_mkldnn_ks =
c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks;
if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) {
return kStrided;
} else if (is_sparse()) {
if (is_sparse()) {
return kSparse;
} else if (is_sparse_csr()) {
return kSparseCsr;
} else {
TORCH_INTERNAL_ASSERT(
is_mkldnn(), "There is an error in the layout calculation logic.");
} else if (is_mkldnn()) {
return kMkldnn;
} else {
return kStrided;
}
}
@ -1073,8 +1065,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Whether or not the imaginary part of the tensor should be negated
*/
inline bool is_conj() const {
constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate);
return key_set_.has_all(conjugate_ks);
return key_set_.has(DispatchKey::Conjugate);
}
/**
@ -1094,8 +1085,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Whether or not the tensor is a zerotensor
*/
inline bool _is_zerotensor() const {
constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor);
return key_set_.has_all(zerotensor_ks);
return key_set_.has(DispatchKey::ZeroTensor);
}
/**
@ -1115,8 +1105,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Whether or not the tensor should be negated
*/
inline bool is_neg() const {
constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative);
return key_set_.has_all(negative_ks);
return key_set_.has(DispatchKey::Negative);
}
/**
@ -1487,14 +1476,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
void set_python_dispatch(bool k) {
if (k) {
key_set_ = key_set_.add(c10::python_ks);
key_set_ = key_set_.add(DispatchKey::Python).add(DispatchKey::PythonTLSSnapshot);
} else {
key_set_ = key_set_ - c10::python_ks;
key_set_ = key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot);
}
}
bool is_python_dispatch() const {
return key_set_.has_all(c10::python_ks);
return key_set_.has(DispatchKey::Python);
}
/**
@ -1559,22 +1548,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
*/
inline bool has_compatible_shallow_copy_type(DispatchKeySet from) {
auto is_dense = [](DispatchKeySet ts) {
constexpr auto dense_backends = DispatchKeySet(
{BackendComponent::CPUBit,
BackendComponent::CUDABit,
BackendComponent::HIPBit,
BackendComponent::XPUBit});
constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense);
return ts.has_any(dense_k) && ts.has_any(dense_backends);
return ts.has(DispatchKey::CPU) || ts.has(DispatchKey::CUDA) ||
ts.has(DispatchKey::HIP) || ts.has(DispatchKey::XPU);
};
auto is_sparse = [](DispatchKeySet ts) {
constexpr auto sparse_backends = DispatchKeySet(
{BackendComponent::CPUBit,
BackendComponent::CUDABit,
BackendComponent::HIPBit,
BackendComponent::XPUBit});
constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse);
return ts.has_any(sparse_k) && ts.has_any(sparse_backends);
return ts.has(DispatchKey::SparseCPU) ||
ts.has(DispatchKey::SparseCUDA) || ts.has(DispatchKey::SparseHIP) ||
ts.has(DispatchKey::SparseXPU);
};
return (key_set_ == from) || (is_dense(key_set_) && is_dense(from)) ||
(is_sparse(key_set_) && is_sparse(from));

View File

@ -117,6 +117,20 @@ class C10_API ExcludeDispatchKeyGuard {
DispatchKeySet exclude_;
};
struct C10_API ForceDispatchKeyGuard {
public:
ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) :
saved_keyset_(c10::impl::tls_local_dispatch_key_set()) {
c10::impl::_force_tls_local_dispatch_key_set(key_set);
}
~ForceDispatchKeyGuard() {
c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_);
}
private:
c10::impl::LocalDispatchKeySet saved_keyset_;
};
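// A minimal usage sketch (illustrative; the guard simply swaps in the given
// TLS state and restores the previous one on destruction):
//   {
//     auto tls = c10::impl::tls_local_dispatch_key_set();
//     // ...modify `tls` as needed before forcing it...
//     c10::impl::ForceDispatchKeyGuard guard(tls);
//     // TLS dispatch state is pinned to `tls` within this scope.
//   } // previous TLS state restored here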
// Non-RAII API for manipulating the thread-local dispatch state.
// Please prefer the RAII API. The non-RAII API may be useful when
// the included/excluded state of a given DispatchKey must span

View File

@ -3,163 +3,25 @@
#include <unordered_set>
#include <c10/core/DispatchKeySet.h>
#include <c10/util/irange.h>
using namespace c10;
// This test exists not to be comprehensive, but to more clearly show
// what the semantics of DispatchKeySet are.
TEST(DispatchKeySet, ShowSemantics) {
// the "CPU" dispatch key is an instance of a per-backend-functionality key.
// It corresponds to "dense" functionality, "CPU" backend.
// This means that it gets a dense functionality bit, and a cpu backend bit
// set.
auto undefined_set = DispatchKeySet();
auto dense_cpu_set = DispatchKeySet(DispatchKey::CPU);
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense));
ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit));
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU));
auto dense_lazy_set = DispatchKeySet(DispatchKey::Lazy);
ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Dense));
ASSERT_TRUE(dense_lazy_set.has_backend(BackendComponent::LazyBit));
ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Lazy));
// You can think of "Dense/Sparse", and "CPUBit/CUDABit", as "building block"
// dispatch keys. You are allowed to directly create keysets out of them!
auto dense_cpu_set_from_building_blocks = DispatchKeySet(DispatchKey::Dense) |
DispatchKeySet(BackendComponent::CPUBit);
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense));
ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit));
ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU));
ASSERT_EQ(dense_cpu_set, dense_cpu_set_from_building_blocks);
// Similarly, the AutogradCUDA key gets 2 bits in the keyset:
// The "Autograd" functionality bit, and the "CUDA" backend bit
auto autograd_cuda = DispatchKeySet(DispatchKey::AutogradCUDA);
ASSERT_TRUE(autograd_cuda.has(DispatchKey::AutogradFunctionality));
ASSERT_TRUE(autograd_cuda.has_backend(BackendComponent::CUDABit));
// Because DispatchKeySet uses a condensed internal representation, you cannot
// use it to represent the FULL cross product of backends and functionalities.
// For example:
auto autograd_dense_cpu_cuda = DispatchKeySet(
{DispatchKey::AutogradFunctionality,
DispatchKey::Dense,
DispatchKey::CUDA,
DispatchKey::CPU});
auto fpga = DispatchKeySet(DispatchKey::FPGA);
auto fpga_and_cpu = DispatchKeySet({DispatchKey::FPGA, DispatchKey::CPU});
// this keyset has all of the building block keys:
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradFunctionality));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::Dense));
ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CUDABit));
ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CPUBit));
// and it also has the "runtime" keys that correspond to the full
// cross-product of functionality
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CPU));
ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CUDA));
// This means that there's no way to represent a keyset with, say, only
// Autograd CUDA + Dense CPU. Instead, you should think of a keyset as
// inheriting the full set of functionalities + backends of its keys. This
// means that the below keysets are all indistinguishable from each other.
ASSERT_EQ(
autograd_dense_cpu_cuda,
DispatchKeySet(
{DispatchKey::AutogradCUDA,
DispatchKey::AutogradCPU,
DispatchKey::CUDA,
DispatchKey::CPU}));
ASSERT_EQ(
autograd_dense_cpu_cuda,
DispatchKeySet({DispatchKey::AutogradCUDA, DispatchKey::CPU}));
ASSERT_EQ(
autograd_dense_cpu_cuda,
DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCPU}));
// ~~~~~~~~~~ DispatchKeySet iterators ~~~~~~~~~~~
// Iterators allow you to iterate individually through the DispatchKey's in a
// DispatchKeySet
auto empty_set = DispatchKeySet();
auto t1 = empty_set.begin();
auto t2 = empty_set.end();
ASSERT_EQ(*empty_set.begin(), *empty_set.end());
// However, only keys that correspond to actual runtime indices of kernels in
// the operator table show up when you iterate through a keyset. i.e.
// DispatchKey::Dense, and BackendComponent::CPUBit won't show up in an
// iterator.
auto dense_cpu_iter = dense_cpu_set.begin();
ASSERT_EQ(*dense_cpu_iter++, DispatchKey::CPU);
ASSERT_EQ(*dense_cpu_iter, *dense_cpu_set.end());
auto autograd_dense_cpu_cuda_iter = autograd_dense_cpu_cuda.begin();
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CPU);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CUDA);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCPU);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCUDA);
ASSERT_EQ(*autograd_dense_cpu_cuda_iter, *autograd_dense_cpu_cuda.end());
// But other "functionality bits" that are not defined per-backend DO get
// their own slots in the operator table.
auto mixed_keyset = DispatchKeySet(BackendComponent::CPUBit) |
DispatchKeySet(
{DispatchKey::FPGA, // runtime key
DispatchKey::Functionalize, // runtime key
DispatchKey::Dense}); // NOT a runtime key
auto mixed_iter = mixed_keyset.begin();
ASSERT_EQ(*mixed_iter++, DispatchKey::CPU);
ASSERT_EQ(*mixed_iter++, DispatchKey::FPGA);
ASSERT_EQ(*mixed_iter++, DispatchKey::Functionalize);
ASSERT_EQ(*mixed_iter, *mixed_keyset.end());
}
TEST(DispatchKeySet, Empty) {
DispatchKeySet empty_set;
for (uint8_t i = 0;
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
if (tid == DispatchKey::Undefined)
continue;
ASSERT_FALSE(empty_set.has(tid));
}
ASSERT_TRUE(empty_set.empty());
DispatchKeySet empty_set2;
ASSERT_TRUE(empty_set == empty_set2);
ASSERT_EQ(empty_set.highestPriorityTypeId(), DispatchKey::Undefined);
}
// This covers all keys that correspond to a single backend bit, e.g.
// BackendComponent::CPUBit. Even though these are NOT runtime keys, we still
// allow adding them directly to a keyset
TEST(DispatchKeySet, SingletonBackendComponent) {
for (const auto i : c10::irange(1, num_backends)) {
auto tid = static_cast<DispatchKey>(i);
DispatchKeySet sing(tid);
ASSERT_EQ(sing, sing);
ASSERT_EQ(sing, DispatchKeySet().add(tid));
ASSERT_EQ(sing, sing.add(tid));
ASSERT_EQ(sing, sing | sing);
ASSERT_FALSE(sing.empty());
ASSERT_TRUE(sing.has(tid));
}
}
// This covers all keys that correspond to a single functionality bit:
// - runtime, not-per-backend functionality keys, e.g.
// DispatchKey::FuncTorchBatched
// - runtime, "fake backend" keys, e.g. DispatchKey::FPGA
// - NOT-runtime, per-backend functionality keys, e.g. DispatchKey::Dense
// Even though it's not a runtime key, we still allow adding it directly to a
// keyset.
// DispatchKey::
TEST(DispatchKeySet, SingletonFunctionalityKeys) {
for (const auto i : c10::irange(1, num_functionality_keys)) {
TEST(DispatchKeySet, Singleton) {
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
DispatchKeySet sing(tid);
ASSERT_EQ(sing, sing);
@ -168,145 +30,47 @@ TEST(DispatchKeySet, SingletonFunctionalityKeys) {
ASSERT_EQ(sing, sing | sing);
ASSERT_FALSE(sing.empty());
ASSERT_TRUE(sing.has(tid));
ASSERT_EQ(sing.highestPriorityTypeId(), tid);
ASSERT_EQ(sing.remove(tid), DispatchKeySet());
}
}
// This covers runtime keys that are per-backend,
// and take up more than one bit in a DispatchKeySet. They take up one
// functionality bit + one backend bit. e.g. CPU, CUDA, SparseCPU, SparseCUDA,
// AutogradCPU, AutogradCUDA
TEST(DispatchKeySet, SingletonPerBackendFunctionalityKeys) {
for (uint8_t i = static_cast<uint8_t>(DispatchKey::StartOfDenseBackends);
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
// Skip these because they aren't real keys.
if (tid == DispatchKey::StartOfDenseBackends ||
tid == DispatchKey::StartOfSparseBackends ||
tid == DispatchKey::StartOfQuantizedBackends ||
tid == DispatchKey::StartOfAutogradBackends) {
continue;
}
DispatchKeySet sing(tid);
ASSERT_EQ(sing, sing);
ASSERT_EQ(sing, DispatchKeySet().add(tid));
ASSERT_EQ(sing, sing.add(tid));
ASSERT_EQ(sing, sing | sing);
ASSERT_FALSE(sing.empty());
ASSERT_TRUE(sing.has(tid));
auto functionality_key = toFunctionalityKey(tid);
auto backend_key = toBackendComponent(tid);
// These two sets should be equivalent:
// DispatchKeySet(DispatchKey::CPU)
// DispatchKeySet({DispatchKey::Dense, BackendComponent::CPUBit})
auto expected_ks =
DispatchKeySet(functionality_key) | DispatchKeySet(backend_key);
ASSERT_EQ(sing, expected_ks);
// These two sets should be equivalent:
// DispatchKeySet(DispatchKey::CPU).remove(DispatchKey::Dense)
// DispatchKeySet(BackendComponent::CPUBit)
expected_ks = DispatchKeySet(toBackendComponent(tid));
ASSERT_EQ(sing.remove(tid), expected_ks);
}
}
TEST(DispatchKeySet, DoubletonPerBackend) {
for (uint8_t i = static_cast<uint8_t>(DispatchKey::StartOfDenseBackends);
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
TEST(DispatchKeySet, Doubleton) {
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
for (uint8_t j = i + 1;
j <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
j < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
j++) {
ASSERT_LT(i, j);
auto tid1 = static_cast<DispatchKey>(i);
auto tid2 = static_cast<DispatchKey>(j);
// Skip these because they aren't real keys.
if (tid1 == DispatchKey::StartOfDenseBackends ||
tid1 == DispatchKey::StartOfSparseBackends ||
tid1 == DispatchKey::StartOfQuantizedBackends ||
tid1 == DispatchKey::StartOfAutogradBackends)
continue;
if (tid2 == DispatchKey::StartOfDenseBackends ||
tid2 == DispatchKey::StartOfSparseBackends ||
tid2 == DispatchKey::StartOfQuantizedBackends ||
tid2 == DispatchKey::StartOfAutogradBackends)
continue;
auto backend1 = toBackendComponent(tid1);
auto backend2 = toBackendComponent(tid2);
auto functionality1 = toFunctionalityKey(tid1);
auto functionality2 = toFunctionalityKey(tid2);
auto combined = DispatchKeySet({tid1, tid2});
// The combined set has the backend bits
ASSERT_TRUE(combined.has_backend(backend1));
ASSERT_TRUE(combined.has_backend(backend2));
// and it has the functionality bits
ASSERT_TRUE(combined.has(functionality1));
ASSERT_TRUE(combined.has(functionality2));
// and it has the original two runtime keys
ASSERT_TRUE(combined.has(tid1));
ASSERT_TRUE(combined.has(tid2));
// Add all of the keys in the keyset to a real set
std::unordered_set<DispatchKey> visited_keys;
auto iter = combined.begin();
while (*iter != *combined.end()) {
visited_keys.insert(*iter);
++iter;
}
std::unordered_set<DispatchKey> expected_keys;
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality1, backend1));
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality1, backend2));
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality2, backend1));
expected_keys.insert(
toRuntimePerBackendFunctionalityKey(functionality2, backend2));
ASSERT_EQ(expected_keys, visited_keys);
if (backend1 == backend2 || functionality1 == functionality2) {
// We have two runtime keys, with either the same backend or the same
// per-backend functionalities. E.g. {AutogradCUDA, CUDA} or
// {AutogradCPU, AutogradCUDA} There should be 2 total runtime keys in
// this set.
ASSERT_EQ(2, visited_keys.size());
} else {
// since i and j are different keys, they should not have the same
// functionality and backend
ASSERT_TRUE(backend1 != backend2 && functionality1 != functionality2);
// We have two runtime keys, that have different backends + per-backend
// functionalities. So we should expect the full cross product of
// runtime keys to be in the set. e.g. if i = AutogradCUDA, and j = CPU,
// then combined = {AutogradCUDA, AutogradCPU, CUDA, CPU}
ASSERT_EQ(4, visited_keys.size());
}
auto doub = DispatchKeySet(tid1).add(tid2);
ASSERT_EQ(doub, DispatchKeySet(tid1) | DispatchKeySet(tid2));
ASSERT_TRUE(doub.has(tid1));
ASSERT_TRUE(doub.has(tid2));
ASSERT_EQ(doub.highestPriorityTypeId(), tid2); // relies on i < j
}
}
}
TEST(DispatchKeySet, Full) {
DispatchKeySet full(DispatchKeySet::FULL);
for (const auto i : c10::irange(1, num_functionality_keys)) {
for (uint8_t i = 1; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys);
i++) {
auto tid = static_cast<DispatchKey>(i);
ASSERT_TRUE(full.has(tid));
}
ASSERT_FALSE(full.has(DispatchKey::EndOfFunctionalityKeys));
}
TEST(DispatchKeySet, IteratorBasicOps) {
DispatchKeySet empty_set;
DispatchKeySet full_set(DispatchKeySet::FULL);
DispatchKeySet mutated_set = empty_set.add(DispatchKey::CPU);
DispatchKeySet mutated_set = empty_set.add(static_cast<DispatchKey>(1));
// Constructor + Comparison
ASSERT_EQ(*empty_set.begin(), DispatchKey::EndOfFunctionalityKeys);
ASSERT_EQ(*empty_set.end(), DispatchKey::EndOfFunctionalityKeys);
ASSERT_EQ(*mutated_set.begin(), DispatchKey::CPU);
ASSERT_EQ(*empty_set.begin(), DispatchKey::NumDispatchKeys);
ASSERT_EQ(*empty_set.end(), DispatchKey::NumDispatchKeys);
ASSERT_EQ(*mutated_set.begin(), static_cast<DispatchKey>(1));
ASSERT_TRUE(empty_set.begin() == empty_set.end());
ASSERT_TRUE(full_set.begin() != full_set.end());
@ -326,37 +90,16 @@ TEST(DispatchKeySet, IteratorEmpty) {
ASSERT_EQ(i, 0);
}
TEST(DispatchKeySet, IteratorCrossProduct) {
// The iterator should return all runtime keys in the set,
// including the cross product of {backends} x {functionalities}
auto ks =
DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit}) |
DispatchKeySet(
{DispatchKey::Dense,
DispatchKey::FPGA,
DispatchKey::AutogradFunctionality});
auto iter = ks.begin();
// iterate through dense backends first.
ASSERT_EQ(DispatchKey::CPU, *(iter++));
ASSERT_EQ(DispatchKey::CUDA, *(iter++));
// FPGA doesn't have a backend bit, so it isn't included in the cross product.
ASSERT_EQ(DispatchKey::FPGA, *(iter++));
// iterate through the autograd keys last.
ASSERT_EQ(DispatchKey::AutogradCPU, *(iter++));
ASSERT_EQ(DispatchKey::AutogradCUDA, *(iter++));
}
TEST(DispatchKeySet, IteratorFull) {
DispatchKeySet full_set(DispatchKeySet::FULL);
uint8_t i = 0;
for (const auto& it : full_set) {
i++;
ASSERT_TRUE(it == static_cast<DispatchKey>(i));
ASSERT_TRUE(it != DispatchKey::NumDispatchKeys);
}
// Total # of runtime entries includes an entry for DispatchKey::Undefined,
// which is not included when iterating through the DispatchKeySet.
ASSERT_EQ(i, num_runtime_entries - 1);
ASSERT_EQ(i, static_cast<uint8_t>(DispatchKey::NumDispatchKeys) - 1);
}
TEST(DispatchKeySet, IteratorRangeFull) {
@ -365,61 +108,41 @@ TEST(DispatchKeySet, IteratorRangeFull) {
for (DispatchKey dispatch_key : full_set) {
i++;
ASSERT_TRUE(dispatch_key == static_cast<DispatchKey>(i));
}
// Total # of runtime entries includes an entry for DispatchKey::Undefined,
// which is not included when iterating through the DispatchKeySet.
ASSERT_EQ(i, num_runtime_entries - 1);
ASSERT_EQ(i, static_cast<uint8_t>(DispatchKey::NumDispatchKeys) - 1);
}
TEST(DispatchKeySet, SpecificKeys) {
DispatchKeySet keyset({
static_cast<DispatchKey>(0), // Undefined should be ignored
static_cast<DispatchKey>(4),
static_cast<DispatchKey>(10),
static_cast<DispatchKey>(15),
});
std::unordered_set<DispatchKey> visited_keys;
for (DispatchKey key : keyset) {
visited_keys.insert(key);
}
ASSERT_EQ(visited_keys.size(), 3);
ASSERT_TRUE(
visited_keys.find(static_cast<DispatchKey>(4)) != visited_keys.end());
ASSERT_TRUE(
visited_keys.find(static_cast<DispatchKey>(10)) != visited_keys.end());
ASSERT_TRUE(
visited_keys.find(static_cast<DispatchKey>(15)) != visited_keys.end());
}
TEST(DispatchKeySet, FailAtEndIterator) {
DispatchKeySet full_set(DispatchKeySet::FULL);
uint64_t raw_repr = full_set.raw_repr();
// doesn't throw
DispatchKeySet::iterator(&raw_repr, num_backends + num_functionality_keys);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
EXPECT_THROW(
DispatchKeySet::iterator(
&raw_repr, num_backends + num_functionality_keys + 1),
&raw_repr, static_cast<uint8_t>(DispatchKey::NumDispatchKeys) + 1),
c10::Error);
}
TEST(DispatchKeySet, TestKeyOrderingInvariants) {
for (uint8_t i = static_cast<uint8_t>(DispatchKey::StartOfDenseBackends);
i <= static_cast<uint8_t>(DispatchKey::EndOfRuntimeBackendKeys);
i++) {
auto k = static_cast<DispatchKey>(i);
// Note [The Ordering of Per-Backend Dispatch Keys Matters!]
// The DispatchKey enum includes all of the runtime keys for
// Dense/Sparse/Quantized/Autograd, (e.g. CPU, CUDA, SparseCPU, SparseCUDA,
// AutogradCPU, AutogradCUDA, etc). And we expect the ordering of those keys
// to be the same as the ordering of the backends in the `BackendComponent`
// enum. This makes several utilities in `DispatchKey.h` and
// `DispatchKeySet.h` significantly easier to implement. The purpose of the
// test is to assert (through CI) that this invariant is maintained.
//
// The only way that we can really check this invariant is by
// comparing the string names of each enum.
// We only really care about the ordering for "real" keys that are actually
// used, which we expect to be able to print properly. This saves us from
// having to enumerate the full set of possible runtime keys in
// DispatchKey::toString(). It also relies on toString() being implemented
// correctly.
auto functionality_str = std::string(toString(k));
if (functionality_str == "UNKNOWN_TENSOR_TYPE_ID")
continue;
auto computed_backend_k = toBackendComponent(k);
auto computed_backend_str = std::string(toString(computed_backend_k));
// Strip, e.g., the "Bit" from "CPUBit"
computed_backend_str =
computed_backend_str.substr(0, computed_backend_str.size() - 3);
ASSERT_TRUE(
functionality_str.find(computed_backend_str) != std::string::npos)
<< "DispatchKey invariant broken! Found a key that is not ordered correctly"
<< " with its backend bit. key = " << toString(k) << ", " << k
<< ", computed backend = " << toString(computed_backend_k);
}
}

View File

@ -12,7 +12,7 @@
// C10
// - Move file to `c10` namespace.
// - Remove macro use in line 478 because the nvcc device compiler cannot handle
// it it.
// it.
// - Revise constructor logic so that it is 1) consistent with c++ 17 standard
// documented here in (8):
// https://en.cppreference.com/w/cpp/utility/optional/optional, and 2) able to

View File

@ -15,13 +15,12 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
return true;
} else {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
float scale = 1. / (1. - ratio_);
float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
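// e.g. with ratio_ = 0.75 the surviving activations are scaled by
// 1 / (1 - 0.75) = 4 so the expected value matches eval mode; with
// ratio_ = 1.0 everything is dropped, so the scale is forced to 0 rather
// than dividing by zero.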
// mask=true means keep, and mask=false means not keep, so we will
// generate probability depending on 1-ratio.
at::bernoulli_distribution<double> dist(1. - ratio_);
const float* Xdata = X.data<float>();
float* Ydata = Y->template mutable_data<float>();
auto mask = Output(1, X.sizes(), at::dtype<bool>());
bool* mask_data = mask->template mutable_data<bool>();
auto* gen = context_.RandGenerator();
@ -52,7 +51,7 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
const bool* mask_data = mask.data<bool>();
float* dXdata = dX->template mutable_data<float>();
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
float scale = 1. / (1. - ratio_);
float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_);
for (int i = 0; i < dY.numel(); ++i) {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
dXdata[i] = dYdata[i] * mask_data[i] * scale;
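The guarded scale in the two hunks above avoids a division by zero when ratio_ == 1. A minimal NumPy sketch of the same inverted-dropout scaling (my own illustration, not the Caffe2 code):

import numpy as np

def dropout_forward(x, ratio, seed=0):
    # Inverted dropout: kept activations are scaled by 1 / (1 - ratio) so the
    # expected value is unchanged at train time. When ratio == 1 everything is
    # dropped, so the scale is forced to 0 instead of dividing by zero.
    scale = 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)
    mask = np.random.default_rng(seed).random(x.shape) < (1.0 - ratio)  # True = keep
    return x * mask * scale, mask

y, mask = dropout_forward(np.ones((2, 3), dtype=np.float32), ratio=1.0)
assert not y.any()  # with ratio=1 the output is deterministically all zeros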

View File

@ -19,7 +19,6 @@ class DropoutOp final : public Operator<Context> {
is_test_(
this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
CAFFE_ENFORCE_LT(ratio_, 1);
}
bool RunOnDevice() override;
@ -41,7 +40,6 @@ class DropoutGradientOp final : public Operator<Context> {
is_test_(
this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
CAFFE_ENFORCE_LT(ratio_, 1);
}
bool RunOnDevice() override;

View File

@ -74,3 +74,35 @@ class TestDropout(serial.SerializedTestCase):
gc, op, [X], reference_dropout_ratio0,
# Don't check the mask with cuDNN because it's packed data
outputs_to_check=None if engine != 'CUDNN' else [0])
@given(X=hu.tensor(),
in_place=st.booleans(),
output_mask=st.booleans(),
engine=st.sampled_from(["", "CUDNN"]),
**hu.gcs)
@settings(deadline=10000)
def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc):
"""Test with ratio=0 for a deterministic reference impl."""
if in_place:
# Skip if trying in-place on GPU
assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP})
# If in-place on CPU, don't compare with GPU
dc = dc[:1]
is_test = not output_mask
op = core.CreateOperator("Dropout", ["X"],
["X" if in_place else "Y"] +
(["mask"] if output_mask else []),
ratio=1.0, engine=engine,
is_test=is_test)
self.assertDeviceChecks(dc, op, [X], [0])
if not is_test:
self.assertGradientChecks(gc, op, [X], 0, [0])
def reference_dropout_ratio1(x):
return (x,) if is_test else (np.zeros(x.shape, dtype=np.float), np.zeros(x.shape, dtype=np.bool))
self.assertReferenceChecks(
gc, op, [X], reference_dropout_ratio1,
# Don't check the mask with cuDNN because it's packed data
outputs_to_check=None if engine != 'CUDNN' else [0])

View File

@ -385,7 +385,7 @@ class TestSequenceOps(serial.SerializedTestCase):
["shrunk_data"])
def op_ref(data, indices):
unique_indices = np.unique(indices)
unique_indices = np.unique(indices) if len(indices)>0 else np.array([],dtype=np.int64)
sorted_indices = np.sort(unique_indices)
shrunk_data = np.delete(data, sorted_indices, axis=0)
return (shrunk_data,)
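The len(indices) > 0 guard above is presumably needed because np.unique of an empty Python list comes back as a float64 array, which is not a valid index dtype for np.delete; a quick sketch of the difference (my own illustration, not from the PR):

import numpy as np

data = np.arange(12.0).reshape(4, 3)

empty = np.unique([])                # dtype is float64, not an integer type
safe = np.array([], dtype=np.int64)  # what the guarded branch produces instead

# np.delete with an empty *integer* index array is a clean no-op:
assert np.array_equal(np.delete(data, np.sort(safe), axis=0), data)

# whereas recent NumPy versions reject a float64 index array
# (older versions only emitted a deprecation warning):
try:
    np.delete(data, empty, axis=0)
except IndexError:
    pass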

View File

@ -110,22 +110,28 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L;
// 0x2L: (Comment missing)
// 0x3L: (Comment missing)
// 0x4L: (update) Added schema to function tuple. Forward-compatible change.
// 0x5L: (update) Update bytecode is sharing constant tensor files from torchscript, and only serialize
// extra tensors that are not in the torchscript constant table. Also update tensor storage schema adapting
// to the unify format, the root key of tensor storage is updated from {index} to
// {the_pointer_value_the_tensor.storage}, for example: `140245072983168.storage`
// Forward-compatibility change.
// 0x6L: Implicit operator versioning using the number of specified arguments.
// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845
// for details.
// 0x7L: Enable support for operators with default arguments plus out arguments.
constexpr uint64_t kProducedBytecodeVersion = 0x7L;
// 0x5L: (update) Bytecode now shares constant tensor files with TorchScript
// and only serializes extra tensors that are not in the TorchScript constant
// table. Also updates the tensor storage schema to the unified format: the
// root key of tensor storage changes from {index} to
// {the_pointer_value_the_tensor.storage}, for example
// `140245072983168.storage`. Forward-compatibility change.
// 0x6L: Implicit operator versioning using the number of specified arguments.
// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for
// details.
// 0x7L: Enable support for operators with default arguments plus out
// arguments.
// 0x8L: Emit promoted operators as instructions
constexpr uint64_t kProducedBytecodeVersion = 0x8L;
// static_assert(
// kProducedBytecodeVersion >= kProducedFileFormatVersion,
// "kProducedBytecodeVersion must be higher or equal to
// kProducedFileFormatVersion.");
// Introduce kMinSupportedBytecodeVersion and kMaxSupportedBytecodeVersion
// for limited backward/forward compatibility support of bytecode. If
// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion (in loader),
// we should support this model_version. For example, we provide a wrapper to
// handle an updated operator.
// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion
// (in loader), we should support this model_version. For example, we provide a
// wrapper to handle an updated operator.
constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L;
constexpr uint64_t kMaxSupportedBytecodeVersion = 0x8L;
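The supported-version window described above amounts to a simple range check in the loader; a minimal Python sketch (my own, mirroring the constants in the header but not the actual loader code):

K_MIN_SUPPORTED_BYTECODE_VERSION = 0x3
K_MAX_SUPPORTED_BYTECODE_VERSION = 0x8

def check_bytecode_version(model_version: int) -> None:
    # A runtime only loads models whose bytecode version it knows how to
    # interpret, possibly through wrappers/adapters for older operators.
    if not (K_MIN_SUPPORTED_BYTECODE_VERSION
            <= model_version
            <= K_MAX_SUPPORTED_BYTECODE_VERSION):
        raise RuntimeError(
            f"Unsupported bytecode version {model_version:#x}; this runtime "
            f"supports {K_MIN_SUPPORTED_BYTECODE_VERSION:#x}.."
            f"{K_MAX_SUPPORTED_BYTECODE_VERSION:#x}")

check_bytecode_version(0x7)    # fine
# check_bytecode_version(0x9)  # would raise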

View File

@ -243,7 +243,6 @@ coverage_missing_automodule = [
"torch.fft",
"torch.for_onnx",
"torch.fx.experimental",
"torch.fx.experimental.fx_acc",
"torch.fx.experimental.unification",
"torch.fx.experimental.unification.multipledispatch",
"torch.fx.passes",
@ -326,14 +325,11 @@ release = 'master'
# Customized html_title here.
# Default is " ".join(project, release, "documentation") if not set
if RELEASE:
# remove hash (start with 'a') from version number if any
version_end = torch_version.find('a')
if version_end == -1:
html_title = " ".join((project, torch_version, "documentation"))
version = torch_version
else:
html_title = " ".join((project, torch_version[:version_end], "documentation"))
version = torch_version[:version_end]
# Turn 1.11.0aHASH into 1.11
# Note: the release candidates should no longer have the aHASH suffix, but in any
# case we wish to leave only major.minor, even for rc builds.
version = '.'.join(torch_version.split('.')[:2])
html_title = " ".join((project, version, "documentation"))
release = version
# The language for content autogenerated by Sphinx. Refer to documentation
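As a quick sanity check of the simplified major.minor computation above (the input strings are made-up examples):

for torch_version in ("1.11.0a0+git1234567", "1.11.0", "1.12.0rc1"):
    version = '.'.join(torch_version.split('.')[:2])
    print(torch_version, "->", version)  # -> 1.11, 1.11, 1.12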

View File

@ -355,7 +355,7 @@ Extending :mod:`torch` with a :class:`Tensor`-like type
.. note:: This functionality is inspired by the NumPy ``__array_function__``
protocol. See `the NumPy documentation
<https://docs.scipy.org/doc/numpy/user/basics.dispatch.html#basics-dispatch>`_
<https://numpy.org/doc/stable/user/basics.dispatch.html#basics-dispatch>`_
and `NEP-0018
<https://numpy.org/neps/nep-0018-array-function-protocol.html>`_ for
more details.
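Below is a minimal sketch of the __torch_function__ hook on a Tensor-like wrapper that this note refers to; the WrappedTensor class and its unwrap logic are my own illustration (and it ignores the types argument for brevity), not the example used elsewhere on this page:

import torch

class WrappedTensor:
    """Illustrative Tensor-like type; stores a real Tensor internally."""
    def __init__(self, data):
        self.data = torch.as_tensor(data)

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        unwrap = lambda x: x.data if isinstance(x, WrappedTensor) else x
        out = func(*map(unwrap, args), **{k: unwrap(v) for k, v in kwargs.items()})
        return WrappedTensor(out) if isinstance(out, torch.Tensor) else out

w = WrappedTensor([1.0, 2.0])
result = torch.add(w, torch.ones(2))  # dispatches to WrappedTensor.__torch_function__
assert isinstance(result, WrappedTensor)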

View File

@ -72,7 +72,12 @@ class BackendWithCompiler : public PyTorchBackendInterface {
return true;
}
// Since the actual compilation is done AOT,
// Since the actual compilation is done AOT for this backend, compile just
// forwards everything along. In a non-toy setup this could grab information
// from the runtime that might be relevant to execution, such as build flags,
// the resolution of the device's camera, or basically any runtime-specific
// information that wouldn't be available server-side, where preprocess is
// called.
c10::impl::GenericDict compile(
c10::IValue processed,
c10::impl::GenericDict method_compile_spec) override {
@ -86,8 +91,14 @@ class BackendWithCompiler : public PyTorchBackendInterface {
return c10::impl::toGenericDict(handles);
}
// Function that actually executes the model in the backend. Here there is
// nothing to dispatch to, so the backend is implemented locally within
// execute and it only supports add, subtract, and constant. In a non-toy
// backend you can imagine how this function could be used to actually
// dispatch the inputs to the relevant backend/device.
c10::impl::GenericList execute(
c10::IValue handle,
c10::IValue
handle, // example: [('prim::Constant#1', 14), ('aten::add', 15)]
c10::impl::GenericList inputs) override {
TORCH_INTERNAL_ASSERT(inputs.size() == 2);
c10::IValue val0 = inputs[0];
@ -107,6 +118,7 @@ class BackendWithCompiler : public PyTorchBackendInterface {
auto start_time_us = torch::profiler::impl::getTime() / 1000;
try {
if (instruction.rfind("prim::Constant", 0) == 0) {
// 15 is the length of 'prim::Constant#'; the constant value comes after it
TORCH_CHECK(
instruction.size() > 15,
"Constant value is expected in ",

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff