Merge remote-tracking branch 'upstream/viable/strict' into mkl-spmmd

Ivan Yashchuk 2022-02-23 09:58:31 +00:00
commit f77783c374
133 changed files with 6069 additions and 2660 deletions

.circleci/config.yml generated
View File

@ -847,6 +847,7 @@ jobs:
<<: *binary_mac_params
macos:
xcode: "12.0"
resource_class: "large"
steps:
# See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
- checkout

View File

@ -161,6 +161,7 @@
<<: *binary_mac_params
macos:
xcode: "12.0"
resource_class: "large"
steps:
# See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
- checkout

View File

@ -1,48 +1,54 @@
[
{
- "name": "ONNX exporter",
- "patterns": [
- "torch/onnx/**",
- "torch/csrc/jit/passes/onnx/**",
- "torch/csrc/jit/passes/onnx.*",
- "test/onnx/**",
- "docs/source/onnx.rst",
- "torch/csrc/jit/serialization/export.*",
- "torch/csrc/jit/serialization/onnx.*",
- "torch/_C/__init__.pyi.in",
- "torch/csrc/onnx/**"
+ "name": "ONNX exporter",
+ "patterns": [
+ "torch/onnx/**",
+ "torch/csrc/jit/passes/onnx/**",
+ "torch/csrc/jit/passes/onnx.*",
+ "test/onnx/**",
+ "docs/source/onnx.rst",
+ "torch/csrc/jit/serialization/export.*",
+ "torch/csrc/jit/serialization/onnx.*",
+ "torch/_C/__init__.pyi.in",
+ "torch/csrc/onnx/**"
],
- "approved_by": ["BowenBao", "garymm"],
- "mandatory_app_id": 12274
+ "approved_by": ["BowenBao", "garymm"],
+ "mandatory_app_id": 12274
},
{
- "name": "NVFuser",
- "patterns": ["torch/csrc/jit/codegen/fuser/cuda/**", "torch/csrc/jit/codegen/cuda/**", "benchmarks/cpp/nvfuser/**"],
- "approved_by": ["csarofeen", "ngimel"],
- "mandatory_app_id": 12274
+ "name": "NVFuser",
+ "patterns": ["torch/csrc/jit/codegen/fuser/cuda/**", "torch/csrc/jit/codegen/cuda/**", "benchmarks/cpp/nvfuser/**"],
+ "approved_by": ["csarofeen", "ngimel"],
+ "mandatory_app_id": 12274
},
{
- "name": "OSS CI",
- "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**", "tools/**"],
- "approved_by": ["seemethere", "malfet", "suo", "janeyx99", "ezyang"],
- "mandatory_app_id": 12274
+ "name": "OSS CI",
+ "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**", "tools/**"],
+ "approved_by": ["janeyx99", "ezyang"],
+ "mandatory_app_id": 12274
},
{
"name": "Documentation",
"patterns": ["docs/**", "torch/*docs.py"],
- "approved_by": ["mruberry", "ngimel", "albanD", "janeyx99"],
+ "approved_by": ["mruberry", "ngimel", "janeyx99"],
"mandatory_app_id": 12274
},
{
"name": "Android",
"patterns": ["android/**"],
- "approved_by": ["linbinyu", "kit1980", "IvanKobzarev", "malfet"],
+ "approved_by": ["linbinyu", "kit1980", "IvanKobzarev"],
"mandatory_app_id": 12274
},
{
"name": "iOS",
"patterns": ["ios/**"],
- "approved_by": ["linbinyu", "kit1980", "xta0", "malfet", "hanton"],
+ "approved_by": ["linbinyu", "kit1980", "xta0", "hanton"],
"mandatory_app_id": 12274
},
+ {
+ "name": "superuser",
+ "patterns": ["*"],
+ "approved_by": ["albanD", "jbschlosser", "suo", "osalpekar", "malfet", "seemethere"],
+ "mandatory_app_id": 12274
+ }
]

View File

@ -6,6 +6,10 @@
{%- set squid_no_proxy = "localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%}
{%- set timeout_minutes = 240 -%}
# NOTE: If testing pytorch/builder changes you can change this variable to change what pytorch/builder reference
# the binary builds will check out
{%- set builder_branch = "main" -%}
{%- macro concurrency(build_environment) -%}
concurrency:
group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -191,7 +195,9 @@ concurrency:
- name: Checkout !{{ 'PyTorch' if repository == "pytorch/pytorch" else repository }}
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
{%- if checkout_pr_head %}
{%- if branch %}
ref: !{{ branch }}
{%- elif checkout_pr_head %}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
{%- endif %}
{%- if deep_clone %}
@ -202,9 +208,6 @@ concurrency:
{%- if repository != "pytorch/pytorch" %}
repository: !{{ repository }}
{%- endif %}
{%- if branch %}
ref: !{{ branch }}
{%- endif %}
{%- if directory %}
path: !{{ directory }}
{%- endif %}
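
Context for the template change above: builder_branch is defined once in this common template and passed as branch=common.builder_branch wherever the checkout macro clones pytorch/builder, so the generated workflows pin the builder repository to main instead of whatever ref checkout_pr_head would otherwise select. A rough sketch of what such a macro call expands to in the generated YAML, pieced together from the generated workflow files later in this commit (the exact expansion depends on the full macro body):

- name: Checkout pytorch/builder
  uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
  with:
    ref: main                  # from builder_branch
    submodules: recursive
    repository: pytorch/builder
    path: builder
- name: Clean pytorch/builder checkout
  run: |
    # Remove any artifacts from the previous checkouts
    git clean -fxd
  working-directory: builder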

View File

@ -53,7 +53,7 @@ jobs:
steps:
!{{ common.setup_ec2_linux() }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
{%- if config["gpu_arch_type"] == 'cuda' and config["gpu_arch_version"].startswith('11') %}
- name: Set BUILD_SPLIT_CUDA
run: |
@ -119,16 +119,8 @@ jobs:
with:
name: !{{ config["build_name"] }}
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: pytorch
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: builder
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
{%- if config["gpu_arch_type"] == "cuda" %}
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
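
The binary-build templates now consume common.builder_branch when cloning pytorch/builder, so the builder ref is controlled in one place rather than in per-job actions/checkout@v2 steps. Per the NOTE added to the common template, someone testing pytorch/builder changes would point builder_branch at their own branch before regenerating the workflows; a minimal sketch, with a hypothetical branch name:

{# in the common template shown above; for local testing only, not part of this commit #}
{%- set builder_branch = "my-builder-test-branch" -%}

Every generated "Checkout pytorch/builder" step would then check out that ref instead of main.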

View File

@ -80,7 +80,7 @@ jobs:
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
run: |

View File

@ -60,16 +60,8 @@ jobs:
steps:
!{{ common.setup_ec2_windows() }}
!{{ set_runner_specific_vars() }}
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
- name: Populate binary env
shell: bash
run: |
@ -104,16 +96,8 @@ jobs:
with:
name: !{{ config["build_name"] }}
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: ${{ env.PYTORCH_ROOT }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: ${{ env.BUILDER_ROOT }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
- name: Populate binary env
shell: bash
run: |

View File

@ -111,6 +111,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -248,16 +249,29 @@ jobs:
with:
name: conda-py3_7-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -502,6 +516,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -640,16 +655,29 @@ jobs:
with:
name: conda-py3_7-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -900,6 +928,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1041,16 +1070,29 @@ jobs:
with:
name: conda-py3_7-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -1301,6 +1343,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1442,16 +1485,29 @@ jobs:
with:
name: conda-py3_7-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -1702,6 +1758,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1843,16 +1900,29 @@ jobs:
with:
name: conda-py3_7-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2102,6 +2172,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2239,16 +2310,29 @@ jobs:
with:
name: conda-py3_8-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -2493,6 +2577,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2631,16 +2716,29 @@ jobs:
with:
name: conda-py3_8-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2891,6 +2989,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3032,16 +3131,29 @@ jobs:
with:
name: conda-py3_8-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3292,6 +3404,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3433,16 +3546,29 @@ jobs:
with:
name: conda-py3_8-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3693,6 +3819,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3834,16 +3961,29 @@ jobs:
with:
name: conda-py3_8-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4093,6 +4233,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4230,16 +4371,29 @@ jobs:
with:
name: conda-py3_9-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -4484,6 +4638,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4622,16 +4777,29 @@ jobs:
with:
name: conda-py3_9-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4882,6 +5050,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5023,16 +5192,29 @@ jobs:
with:
name: conda-py3_9-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5283,6 +5465,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5424,16 +5607,29 @@ jobs:
with:
name: conda-py3_9-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5684,6 +5880,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5825,16 +6022,29 @@ jobs:
with:
name: conda-py3_9-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6084,6 +6294,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6221,16 +6432,29 @@ jobs:
with:
name: conda-py3_10-cpu
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -6475,6 +6699,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6613,16 +6838,29 @@ jobs:
with:
name: conda-py3_10-cuda10_2
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6873,6 +7111,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7014,16 +7253,29 @@ jobs:
with:
name: conda-py3_10-cuda11_1
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7274,6 +7526,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7415,16 +7668,29 @@ jobs:
with:
name: conda-py3_10-cuda11_3
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7675,6 +7941,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7816,16 +8083,29 @@ jobs:
with:
name: conda-py3_10-cuda11_5
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |

View File

@ -112,6 +112,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -250,16 +251,29 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -505,6 +519,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -643,16 +658,29 @@ jobs:
with:
name: libtorch-cpu-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -898,6 +926,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1036,16 +1065,29 @@ jobs:
with:
name: libtorch-cpu-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1291,6 +1333,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1429,16 +1472,29 @@ jobs:
with:
name: libtorch-cpu-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1685,6 +1741,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1824,16 +1881,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2086,6 +2156,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2225,16 +2296,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2487,6 +2571,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2626,16 +2711,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2888,6 +2986,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3027,16 +3126,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3289,6 +3401,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3431,16 +3544,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3693,6 +3819,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3835,16 +3962,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4097,6 +4237,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4239,16 +4380,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4501,6 +4655,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4643,16 +4798,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4905,6 +5073,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5047,16 +5216,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5309,6 +5491,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5451,16 +5634,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5713,6 +5909,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5855,16 +6052,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6117,6 +6327,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6259,16 +6470,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6521,6 +6745,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6663,16 +6888,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6925,6 +7163,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7067,16 +7306,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7329,6 +7581,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7471,16 +7724,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-with-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7733,6 +7999,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7875,16 +8142,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-without-deps-cxx11-abi
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |

View File

@ -112,6 +112,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -250,16 +251,29 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -505,6 +519,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -643,16 +658,29 @@ jobs:
with:
name: libtorch-cpu-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -898,6 +926,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1036,16 +1065,29 @@ jobs:
with:
name: libtorch-cpu-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1291,6 +1333,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1429,16 +1472,29 @@ jobs:
with:
name: libtorch-cpu-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Pull Docker image
run: |
retry () {
@ -1685,6 +1741,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -1824,16 +1881,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2086,6 +2156,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2225,16 +2296,29 @@ jobs:
with:
name: libtorch-cuda10_2-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2487,6 +2571,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -2626,16 +2711,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -2888,6 +2986,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3027,16 +3126,29 @@ jobs:
with:
name: libtorch-cuda10_2-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3289,6 +3401,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3431,16 +3544,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -3693,6 +3819,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -3835,16 +3962,29 @@ jobs:
with:
name: libtorch-cuda11_1-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4097,6 +4237,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4239,16 +4380,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4501,6 +4655,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -4643,16 +4798,29 @@ jobs:
with:
name: libtorch-cuda11_1-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -4905,6 +5073,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5047,16 +5216,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5309,6 +5491,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5451,16 +5634,29 @@ jobs:
with:
name: libtorch-cuda11_3-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -5713,6 +5909,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -5855,16 +6052,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6117,6 +6327,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6259,16 +6470,29 @@ jobs:
with:
name: libtorch-cuda11_3-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6521,6 +6745,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -6663,16 +6888,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -6925,6 +7163,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7067,16 +7306,29 @@ jobs:
with:
name: libtorch-cuda11_5-shared-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7329,6 +7581,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7471,16 +7724,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-with-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |
@ -7733,6 +7999,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -7875,16 +8142,29 @@ jobs:
with:
name: libtorch-cuda11_5-static-without-deps-pre-cxx11
path: "${{ runner.temp }}/artifacts/"
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
path: pytorch
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
path: pytorch
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
working-directory: pytorch/
run: |

File diff suppressed because it is too large

View File

@ -87,7 +87,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -284,7 +284,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -481,7 +481,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -87,7 +87,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -284,7 +284,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -481,7 +481,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -678,7 +678,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -85,7 +85,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -282,7 +282,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -479,7 +479,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -676,7 +676,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -90,7 +90,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -293,7 +293,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -496,7 +496,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -699,7 +699,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -90,7 +90,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -293,7 +293,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -496,7 +496,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -699,7 +699,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

View File

@ -85,7 +85,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -282,7 +282,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -479,7 +479,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
@ -676,7 +676,7 @@ jobs:
- name: Checkout pytorch/builder
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
ref: main
submodules: recursive
repository: pytorch/builder
path: builder

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -14,9 +14,16 @@ repositories {
jcenter()
}
# lite interpreter build
dependencies {
implementation 'org.pytorch:pytorch_android:1.6.0'
implementation 'org.pytorch:pytorch_android_torchvision:1.6.0'
implementation 'org.pytorch:pytorch_android_lite:1.10.0'
implementation 'org.pytorch:pytorch_android_torchvision_lite:1.10.0'
}
# full jit build
dependencies {
implementation 'org.pytorch:pytorch_android:1.10.0'
implementation 'org.pytorch:pytorch_android_torchvision:1.10.0'
}
```
@ -32,6 +39,15 @@ repositories {
}
}
# lite interpreter build
dependencies {
...
implementation 'org.pytorch:pytorch_android_lite:1.12.0-SNAPSHOT'
implementation 'org.pytorch:pytorch_android_torchvision_lite:1.12.0-SNAPSHOT'
...
}
# full jit build
dependencies {
...
implementation 'org.pytorch:pytorch_android:1.12.0-SNAPSHOT'
@ -68,7 +84,7 @@ They are specified as environment variables:
`ANDROID_HOME` - path to [Android SDK](https://developer.android.com/studio/command-line/sdkmanager.html)
`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk)
`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk). It's recommended to use NDK 21.x.
`GRADLE_HOME` - path to [gradle](https://gradle.org/releases/)
@ -133,7 +149,7 @@ android {
}
dependencies {
extractForNativeBuild('org.pytorch:pytorch_android:1.6.0')
extractForNativeBuild('org.pytorch:pytorch_android:1.10.0')
}
task extractAARForNativeBuild {

View File

@ -29,7 +29,8 @@ check_gradle() {
}
parse_abis_list() {
ABIS_LIST="x86"
# sync with https://github.com/pytorch/pytorch/blob/0ca0e02685a9d033ac4f04e2fa5c8ba6dbc5ae50/android/gradle.properties#L1
ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64"
CUSTOM_ABIS_LIST=false
if [ $# -gt 0 ]; then
ABIS_LIST=$1

View File

@ -50,7 +50,17 @@ android {
}
androidTest {
java {
exclude 'org/pytorch/PytorchHostTests.java'
if(System.env.BUILD_LITE_INTERPRETER == '0') {
println 'Build test for full jit (pytorch_jni)'
exclude 'org/pytorch/PytorchHostTests.java'
exclude 'org/pytorch/PytorchLiteInstrumentedTests.java'
exclude 'org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java'
} else {
println 'Build test for lite interpreter (pytorch_jni_lite)'
exclude 'org/pytorch/PytorchHostTests.java'
exclude 'org/pytorch/PytorchInstrumentedTests.java'
exclude 'org/pytorch/suite/PytorchInstrumentedTestSuite.java'
}
}
}
}

View File

@ -1,4 +1,6 @@
import torch
from torch import Tensor
from typing import Dict, List, Tuple, Optional
OUTPUT_DIR = "src/androidTest/assets/"
@ -7,7 +9,8 @@ def scriptAndSave(module, fileName):
script_module = torch.jit.script(module)
print(script_module.graph)
outputFileName = OUTPUT_DIR + fileName
script_module.save(outputFileName)
# note that the lite interpreter model can also be used in full JIT
script_module._save_for_lite_interpreter(outputFileName)
print("Saved to " + outputFileName)
print('=' * 80)
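For context on this change: the test-asset generator now saves models in the lite-interpreter format, which (per the comment above) the full JIT can still load. A minimal standalone sketch of the same export flow; the `AddOne` module and the `add_one.ptl` path are hypothetical, while `torch.jit.script` and `_save_for_lite_interpreter` are the calls used in the diff:

```python
import torch

class AddOne(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + 1

# Script the module, then save it in the lite-interpreter format.
# The resulting file loads under the lite interpreter (LiteModuleLoader
# on Android) and under the full JIT as well.
scripted = torch.jit.script(AddOne())
scripted._save_for_lite_interpreter("add_one.ptl")
```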

View File

@ -25,6 +25,7 @@ sourceSets {
java {
srcDir '../src/androidTest/java'
exclude '**/PytorchInstrumented*'
exclude '**/PytorchLiteInstrumented*'
}
resources.srcDirs = ["../src/androidTest/assets"]
}

View File

@ -10,7 +10,11 @@ import java.util.Objects;
public class PytorchHostTests extends PytorchTestBase {
@Override
protected String assetFilePath(String assetName) throws IOException {
protected Module loadModel(String path) throws IOException {
return Module.load(assetFilePath(path));
}
private String assetFilePath(String assetName) throws IOException {
Path tempFile = Files.createTempFile("test", ".pt");
try (InputStream resource =
Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("test.pt"))) {

View File

@ -14,7 +14,11 @@ import org.junit.runner.RunWith;
public class PytorchInstrumentedTests extends PytorchTestBase {
@Override
protected String assetFilePath(String assetName) throws IOException {
protected Module loadModel(String path) throws IOException {
return Module.load(assetFilePath(path));
}
private String assetFilePath(String assetName) throws IOException {
final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
File file = new File(appContext.getFilesDir(), assetName);
if (file.exists() && file.length() > 0) {
@ -35,4 +39,5 @@ public class PytorchInstrumentedTests extends PytorchTestBase {
throw e;
}
}
}

View File

@ -0,0 +1,46 @@
package org.pytorch;
import android.content.Context;
import androidx.test.InstrumentationRegistry;
import androidx.test.runner.AndroidJUnit4;
import org.junit.runner.RunWith;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@RunWith(AndroidJUnit4.class)
public class PytorchLiteInstrumentedTests extends PytorchTestBase {
@Override
protected Module loadModel(String path) throws IOException {
return LiteModuleLoader.load(assetFilePath(path));
}
private String assetFilePath(String assetName) throws IOException {
final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
File file = new File(appContext.getFilesDir(), assetName);
if (file.exists() && file.length() > 0) {
return file.getAbsolutePath();
}
try (InputStream is = appContext.getAssets().open(assetName)) {
try (OutputStream os = new FileOutputStream(file)) {
byte[] buffer = new byte[4 * 1024];
int read;
while ((read = is.read(buffer)) != -1) {
os.write(buffer, 0, read);
}
os.flush();
}
return file.getAbsolutePath();
} catch (IOException e) {
throw e;
}
}
}

View File

@ -16,7 +16,7 @@ public abstract class PytorchTestBase {
@Test
public void testForwardNull() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue input = IValue.from(Tensor.fromBlob(Tensor.allocateByteBuffer(1), new long[] {1}));
assertTrue(input.isTensor());
final IValue output = module.forward(input);
@ -25,7 +25,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqBool() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
for (boolean value : new boolean[] {false, true}) {
final IValue input = IValue.from(value);
assertTrue(input.isBool());
@ -38,7 +38,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqInt() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) {
final IValue input = IValue.from(value);
assertTrue(input.isLong());
@ -51,7 +51,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqFloat() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
double[] values =
new double[] {
-Double.MAX_VALUE,
@ -86,7 +86,7 @@ public abstract class PytorchTestBase {
}
final Tensor inputTensor = Tensor.fromBlob(inputTensorData, inputTensorShape);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue input = IValue.from(inputTensor);
assertTrue(input.isTensor());
assertTrue(inputTensor == input.toTensor());
@ -103,7 +103,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqDictIntKeyIntValue() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final Map<Long, IValue> inputMap = new HashMap<>();
inputMap.put(Long.MIN_VALUE, IValue.from(-Long.MIN_VALUE));
@ -127,7 +127,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqDictStrKeyIntValue() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final Map<String, IValue> inputMap = new HashMap<>();
inputMap.put("long_min_value", IValue.from(Long.MIN_VALUE));
@ -151,7 +151,7 @@ public abstract class PytorchTestBase {
@Test
public void testListIntSumReturnTuple() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
for (int n : new int[] {0, 1, 128}) {
long[] a = new long[n];
@ -178,7 +178,7 @@ public abstract class PytorchTestBase {
@Test
public void testOptionalIntIsNone() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
assertFalse(module.runMethod("optionalIntIsNone", IValue.from(1l)).toBool());
assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).toBool());
@ -186,7 +186,7 @@ public abstract class PytorchTestBase {
@Test
public void testIntEq0None() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
assertTrue(module.runMethod("intEq0None", IValue.from(0l)).isNull());
assertTrue(module.runMethod("intEq0None", IValue.from(1l)).toLong() == 1l);
@ -194,7 +194,7 @@ public abstract class PytorchTestBase {
@Test(expected = IllegalArgumentException.class)
public void testRunUndefinedMethod() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
module.runMethod("test_undefined_method_throws_exception");
}
@ -241,7 +241,7 @@ public abstract class PytorchTestBase {
@Test
public void testEqString() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
String[] values =
new String[] {
"smoketest",
@ -260,7 +260,7 @@ public abstract class PytorchTestBase {
@Test
public void testStr3Concat() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
String[] values =
new String[] {
"smoketest",
@ -281,7 +281,7 @@ public abstract class PytorchTestBase {
@Test
public void testEmptyShape() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final long someNumber = 43;
final IValue input = IValue.from(Tensor.fromBlob(new long[] {someNumber}, new long[] {}));
final IValue output = module.runMethod("newEmptyShapeWithItem", input);
@ -293,7 +293,7 @@ public abstract class PytorchTestBase {
@Test
public void testAliasWithOffset() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue output = module.runMethod("testAliasWithOffset");
assertTrue(output.isTensorList());
Tensor[] tensors = output.toTensorList();
@ -303,7 +303,7 @@ public abstract class PytorchTestBase {
@Test
public void testNonContiguous() throws IOException {
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue output = module.runMethod("testNonContiguous");
assertTrue(output.isTensor());
Tensor value = output.toTensor();
@ -316,7 +316,7 @@ public abstract class PytorchTestBase {
long[] inputShape = new long[] {1, 3, 2, 2};
long[] data = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104};
Tensor inputNHWC = Tensor.fromBlob(data, inputShape, MemoryFormat.CHANNELS_LAST);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue outputNCHW = module.runMethod("contiguous", IValue.from(inputNHWC));
assertIValueTensor(
outputNCHW,
@ -334,7 +334,7 @@ public abstract class PytorchTestBase {
long[] dataNHWDC = new long[] {1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16};
Tensor inputNHWDC = Tensor.fromBlob(dataNHWDC, shape, MemoryFormat.CHANNELS_LAST_3D);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue outputNCHWD = module.runMethod("contiguous", IValue.from(inputNHWDC));
assertIValueTensor(outputNCHWD, MemoryFormat.CONTIGUOUS, shape, dataNCHWD);
@ -358,7 +358,7 @@ public abstract class PytorchTestBase {
long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1};
Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST);
final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME));
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue outputNCHW =
module.runMethod("conv2d", IValue.from(inputNCHW), IValue.from(wNCHW), IValue.from(false));
@ -389,5 +389,5 @@ public abstract class PytorchTestBase {
assertArrayEquals(expectedData, t.getDataAsLongArray());
}
protected abstract String assetFilePath(String assetName) throws IOException;
protected abstract Module loadModel(String assetName) throws IOException;
}

View File

@ -0,0 +1,9 @@
package org.pytorch.suite;
import org.junit.runner.RunWith;
import org.junit.runners.Suite;
import org.pytorch.PytorchLiteInstrumentedTests;
@RunWith(Suite.class)
@Suite.SuiteClasses({PytorchLiteInstrumentedTests.class})
public class PytorchLiteInstrumentedTestSuite {}

View File

@ -2,10 +2,18 @@
Provides the implementations of CUDA BLAS function templates.
*/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/util/irange.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/macros/Export.h>
#include <c10/util/irange.h>
// cublasLt was introduced in CUDA 10.1, but we enable it only for 11.1, which
// also added bf16 support
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
#include <cublasLt.h>
#endif
#define CUDABLAS_POSINT_CHECK(FD, X) \
TORCH_CHECK( \
@ -540,6 +548,256 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
namespace {
// Following the pattern of CuSparseDescriptor
// Defined here for now because this is the only place the cublas_lt interface
// is used, but it can be moved to a header once the cublas_lt interface is
// used in multiple places.
template <typename T, cublasStatus_t (*destructor)(T*)>
struct CuBlasLtDeleter {
void operator()(T* x) {
if (x != nullptr) {
TORCH_CUDABLAS_CHECK(destructor(x));
}
}
};
template <typename T, cublasStatus_t (*destructor)(T*)>
class CuBlasLtDescriptor {
public:
T* descriptor() const {
return descriptor_.get();
}
T* descriptor() {
return descriptor_.get();
}
protected:
std::unique_ptr<T, CuBlasLtDeleter<T, destructor>> descriptor_;
};
class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
cublasLtMatmulDescOpaque_t,
&cublasLtMatmulDescDestroy> {
public:
CuBlasLtMatmulDescriptor(
cublasComputeType_t compute_type,
cudaDataType_t scale_type) {
cublasLtMatmulDesc_t raw_descriptor = nullptr;
TORCH_CUDABLAS_CHECK(
cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type));
descriptor_.reset(raw_descriptor);
}
};
class CuBlasLtMatrixLayout : public CuBlasLtDescriptor<
cublasLtMatrixLayoutOpaque_t,
&cublasLtMatrixLayoutDestroy> {
public:
CuBlasLtMatrixLayout(
cudaDataType_t type,
uint64_t rows,
uint64_t cols,
int64_t ld) {
cublasLtMatrixLayout_t raw_descriptor = nullptr;
TORCH_CUDABLAS_CHECK(
cublasLtMatrixLayoutCreate(&raw_descriptor, type, rows, cols, ld));
descriptor_.reset(raw_descriptor);
}
};
class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
cublasLtMatmulPreferenceOpaque_t,
&cublasLtMatmulPreferenceDestroy> {
public:
CuBlasLtMatmulPreference() {
cublasLtMatmulPreference_t raw_descriptor = nullptr;
TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor));
descriptor_.reset(raw_descriptor);
}
};
} // namespace
template <typename Dtype>
void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<Dtype> alpha_val,
const Dtype* mat1_ptr,
int64_t mat1_ld,
const Dtype* mat2_ptr,
int64_t mat2_ld,
const Dtype* bias,
Dtype* result_ptr,
int64_t result_ld) {
using opmath_t = at::opmath_type<Dtype>;
opmath_t beta_val = 0; // bias is added in epilogue
cudaDataType_t abcType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F;
if (std::is_same<Dtype, double>::value) {
abcType = CUDA_R_64F;
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if (std::is_same<Dtype, float>::value) {
if (at::globalContext().allowTF32CuBLAS()) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
abcType = CUDA_R_32F;
} else if (std::is_same<Dtype, at::Half>::value) {
abcType = CUDA_R_16F;
} else if (std::is_same<Dtype, at::BFloat16>::value) {
abcType = CUDA_R_16BF;
}
CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
cublasOperation_t transa = transpose_mat1 ? CUBLAS_OP_T : CUBLAS_OP_N;
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_TRANSA,
&transa,
sizeof(transa)));
cublasOperation_t transb = transpose_mat2 ? CUBLAS_OP_T : CUBLAS_OP_N;
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_TRANSB,
&transb,
sizeof(transb)));
cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_EPILOGUE,
&epilogue,
sizeof(epilogue)));
TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
computeDesc.descriptor(),
CUBLASLT_MATMUL_DESC_BIAS_POINTER,
&bias,
sizeof(Dtype*)));
CuBlasLtMatrixLayout Adesc(
abcType, transpose_mat1 ? k : m, transpose_mat1 ? m : k, mat1_ld);
CuBlasLtMatrixLayout Bdesc(
abcType, transpose_mat2 ? n : k, transpose_mat2 ? k : n, mat2_ld);
CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld);
CuBlasLtMatmulPreference preference;
size_t workspaceSize = 0;
TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
preference.descriptor(),
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspaceSize,
sizeof(workspaceSize)));
auto workspace = at::empty(
{static_cast<int64_t>(workspaceSize)},
at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte));
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
cublasLtHandle_t ltHandle =
reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
ltHandle,
computeDesc.descriptor(),
Adesc.descriptor(),
Bdesc.descriptor(),
Cdesc.descriptor(),
Cdesc.descriptor(),
preference.descriptor(),
1,
&heuristicResult,
&returnedResult));
if (returnedResult == 0) {
TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
}
TORCH_CUDABLAS_CHECK(cublasLtMatmul(
ltHandle,
computeDesc.descriptor(),
&alpha_val,
mat1_ptr,
Adesc.descriptor(),
mat2_ptr,
Bdesc.descriptor(),
&beta_val,
result_ptr,
Cdesc.descriptor(),
result_ptr,
Cdesc.descriptor(),
&heuristicResult.algo,
workspace.data_ptr(),
workspaceSize,
at::cuda::getCurrentCUDAStream()));
}
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<double> alpha_val,
const double* mat1_ptr,
int64_t mat1_ld,
const double* mat2_ptr,
int64_t mat2_ld,
const double* bias,
double* result_ptr,
int64_t result_ld);
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<float> alpha_val,
const float* mat1_ptr,
int64_t mat1_ld,
const float* mat2_ptr,
int64_t mat2_ld,
const float* bias,
float* result_ptr,
int64_t result_ld);
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::Half> alpha_val,
const at::Half* mat1_ptr,
int64_t mat1_ld,
const at::Half* mat2_ptr,
int64_t mat2_ld,
const at::Half* bias,
at::Half* result_ptr,
int64_t result_ld);
template void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::BFloat16> alpha_val,
const at::BFloat16* mat1_ptr,
int64_t mat1_ld,
const at::BFloat16* mat2_ptr,
int64_t mat2_ld,
const at::BFloat16* bias,
at::BFloat16* result_ptr,
int64_t result_ld);
#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
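In tensor terms, the `gemm_and_bias` path above computes `alpha * op(mat1) @ op(mat2)` and lets cuBLASLt add the bias vector in the matmul epilogue (`beta` is fixed to 0, so the destination's previous contents are ignored). A rough PyTorch-level reference of the result, deliberately glossing over cuBLAS's column-major layout and leading-dimension bookkeeping; the function name here is ours, not part of the API:

```python
import torch

def gemm_and_bias_reference(mat1, mat2, bias,
                            transpose_mat1=False, transpose_mat2=False,
                            alpha=1.0):
    # op(x) mirrors CUBLAS_OP_N / CUBLAS_OP_T.
    a = mat1.t() if transpose_mat1 else mat1
    b = mat2.t() if transpose_mat2 else mat2
    # beta == 0: the output is overwritten; the bias is added in the epilogue.
    return alpha * (a @ b) + bias
```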
template <>
void trsm<float>(CUDABLAS_TRSM_ARGTYPES(float)) {
TORCH_CUDABLAS_CHECK(cublasStrsm(

View File

@ -70,6 +70,24 @@ template <>
void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
#endif
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
template <typename Dtype>
void gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<Dtype> alpha_val,
const Dtype* mat1_ptr,
int64_t mat1_ld,
const Dtype* mat2_ptr,
int64_t mat2_ld,
const Dtype* bias,
Dtype* result_ptr,
int64_t result_ld);
#endif
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \
const Dtype *a, int64_t lda, int64_t stridea, \

View File

@ -1,339 +0,0 @@
#include <type_traits>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec/vec256/vec256.h>
namespace at {
namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
return at::native::matmul(a, b.t());
}
template <typename scalar_t>
void transform_bias_rescale_qkv_inner_loop(
int64_t B,
int64_t T,
int64_t _3D,
int64_t D,
int64_t num_head,
int64_t dim_per_head,
scalar_t* qkv_data,
scalar_t* qkv_bias_data,
scalar_t* q_k_v_data,
scalar_t sqrt_dim_per_head,
int64_t begin,
int64_t end) {
for (auto i : c10::irange(begin, end)) {
auto t = i % T;
i /= T;
auto nh = i % num_head;
i /= num_head;
auto b = i;
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
auto dh = 0;
auto d = nh * dim_per_head;
for (; dh + V <= dim_per_head; dh += V, d += V) {
// load
auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]);
auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]);
auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]);
auto q_data =
Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) +
q_bias_data;
auto k_data =
Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) +
k_bias_data;
auto v_data =
Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) +
v_bias_data;
q_data = q_data / Vec(sqrt_dim_per_head);
q_data.store(&q_k_v_data
[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
k_data.store(&q_k_v_data
[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
v_data.store(&q_k_v_data
[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh]);
}
for (; dh < dim_per_head; dh++) {
auto d = nh * dim_per_head + dh;
auto q_bias = qkv_bias_data[d + 0 * D];
auto k_bias = qkv_bias_data[d + 1 * D];
auto v_bias = qkv_bias_data[d + 2 * D];
auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias;
auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias;
auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias;
q_data = q_data / sqrt_dim_per_head;
q_k_v_data[0 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = q_data;
q_k_v_data[1 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = k_data;
q_k_v_data[2 * B * num_head * T * dim_per_head +
b * num_head * T * dim_per_head +
nh * T * dim_per_head +
t * dim_per_head + dh] = v_data;
}
}
}
// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto B = qkv.size(0);
auto T = qkv.size(1);
auto _3D = qkv.size(2);
auto D = _3D / 3;
TORCH_CHECK(D % num_head == 0);
TORCH_CHECK(_3D % 3 == 0);
const auto dim_per_head = D / num_head;
auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v.is_contiguous());
const auto qkv_contig = qkv.expect_contiguous();
const auto qkv_bias_contig = qkv_bias.expect_contiguous();
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
qkv.scalar_type(),
"transform_bias_rescale_qkv",
[&] {
scalar_t* qkv_data = qkv_contig->data_ptr<scalar_t>();
scalar_t* qkv_bias_data = qkv_bias_contig->data_ptr<scalar_t>();
scalar_t* q_k_v_data = q_k_v.data_ptr<scalar_t>();
const scalar_t sqrt_dim_per_head = std::sqrt(static_cast<scalar_t>(dim_per_head));
int64_t grain_size =
std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1);
parallel_for(
0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) {
transform_bias_rescale_qkv_inner_loop(B, T, _3D, D, num_head, dim_per_head, qkv_data, qkv_bias_data, q_k_v_data, sqrt_dim_per_head, begin, end);
});
});
auto q_k_v_s =
at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v_s.size() == 3);
return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]);
}
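For readers skimming this (removed) CPU kernel: the vectorized inner loop implements exactly what the comment above `transform_bias_rescale_qkv` states. It adds the fused QKV bias, splits the result into per-head Q/K/V tensors of shape (B, num_head, T, dim_per_head), and rescales Q by 1/sqrt(dim_per_head). A compact PyTorch sketch of the same transform, offered as a reference rather than a drop-in replacement for the kernel:

```python
import math
import torch

def transform_bias_rescale_qkv_reference(qkv, qkv_bias, num_head):
    # qkv: (B, T, 3*D), qkv_bias: (3*D,), as in the C++ code above.
    B, T, three_D = qkv.shape
    D = three_D // 3
    dim_per_head = D // num_head
    q, k, v = (qkv + qkv_bias).chunk(3, dim=-1)

    def split_heads(x):
        # (B, T, D) -> (B, num_head, T, dim_per_head)
        return x.reshape(B, T, num_head, dim_per_head).permute(0, 2, 1, 3)

    q = split_heads(q) / math.sqrt(dim_per_head)
    return q, split_heads(k), split_heads(v)
```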
Tensor bmm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
auto bt_ = b_.transpose(2, 1);
// TODO: are these a single call to cublas batched matmul?
auto c_ = at::matmul(a_, bt_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)});
}
void masked_softmax_dropout(
Tensor& attn_scores,
const c10::optional<Tensor>& attn_mask) {
auto B = attn_scores.size(0);
auto num_heads = attn_scores.size(1);
auto T = attn_scores.size(2);
if (attn_mask) {
TORCH_CHECK(attn_mask->is_contiguous());
} else {
at::_softmax_out(attn_scores, attn_scores, 3, false);
return;
}
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
attn_scores.scalar_type(),
"masked_softmax_dropout",
[&] {
using accscalar_t = acc_type<scalar_t, false>;
// TODO: proper implementation with masking.
scalar_t* attn_scores_data = attn_scores.data_ptr<scalar_t>();
int64_t grain_size = std::min(internal::GRAIN_SIZE / T, (int64_t)1);
parallel_for(
0, B * num_heads * T, grain_size, [&](int64_t begin, int64_t end) {
for (const auto i : c10::irange(begin, end)) {
using Vec = vec::Vectorized<scalar_t>;
auto V = vec::Vectorized<scalar_t>::size();
scalar_t* input_data = attn_scores_data + i * T;
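// Softmax over each length-T row in three passes: row max, sum of exp, then normalization.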
auto max_input = Vec(std::numeric_limits<scalar_t>::lowest());
// TODO: handle epilogue
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
max_input = vec::maximum(max_input, v);
}
auto hmax = std::numeric_limits<scalar_t>::lowest();
for (auto i = 0; i < V; ++i) {
hmax = std::max(max_input[i], hmax);
}
accscalar_t hsum = 0;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
auto v = Vec::loadu(&input_data[t]);
// TODO: vectorize in accscalar_t?
for (auto i = 0; i < V; ++i) {
hsum += std::exp(static_cast<accscalar_t>(v[i]) - hmax);
}
}
auto inv_denominator = 1.0 / hsum;
TORCH_CHECK(T % V == 0, "epilogue not implemented yet");
for (auto t = 0; t < T; t += V) {
Vec v = Vec::loadu(&input_data[t]);
// TODO: vectorize in accscalar_t?
// TODO: this faster solution does not work on the Android build
/*
for (auto i = 0; i < V; ++i) {
v[i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
}
v.store(&input_data[t]);
*/
for (auto i = 0; i < V; ++i) {
input_data[t + i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
}
}
}
});
});
}
Tensor bmm_nn(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
// TODO: could these be a single call to cublas batched matmul?
auto c_ = at::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)});
}
Tensor transform_0213(const Tensor& a) {
// TODO: check perf vs dedicated kernel.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3));
return a.permute({0, 2, 1, 3})
.contiguous()
.view({a.size(0), a.size(2), a.size(1) * a.size(3)});
}
Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto r_ = at::native::linear(a_, b, c);
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_op_cpu(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto result = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone());
}
Tensor multi_head_self_attention_cpu(
const Tensor& query,
const Tensor& qkv_weight,
const Tensor& qkv_bias,
const Tensor& proj_weight,
const Tensor& proj_bias,
const int64_t num_head,
const c10::optional<Tensor>& mask) {
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(qkv_bias.dim() == 1, "expected 1-dimensional qkv_bias, got ", qkv_bias.dim(), "-D tensor");
TORCH_CHECK(qkv_bias.sizes()[0] == 3 * D, "expected qkv_bias first dim to be 3x last dim of query");
TORCH_CHECK(D % num_head == 0, "D must be divisible by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}
} // namespace native
} // namespace at

View File

@ -102,9 +102,27 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
IntArrayRef mat1_sizes = mat1.sizes();
IntArrayRef mat2_sizes = mat2.sizes();
IntArrayRef self__sizes;
bool useLtInterface = false;
at::ScalarType scalar_type = self.scalar_type();
c10::MaybeOwned<Tensor> self_;
if (&result != &self) {
self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
// Strangely, if mat2 has only 1 row or column, we get a
// CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
// The condition self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
// restricts the Lt interface to the case where self is a bias vector.
useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 &&
result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] &&
self.is_contiguous() &&
(scalar_type == at::ScalarType::Double ||
scalar_type == at::ScalarType::Float ||
scalar_type == at::ScalarType::Half ||
scalar_type == at::ScalarType::BFloat16) &&
mat2_sizes[0] > 1 && mat2_sizes[1] > 1;
#endif
if (!useLtInterface) {
self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");
}
self__sizes = self_->sizes();
} else {
self_ = c10::MaybeOwned<Tensor>::borrowed(self);
@ -115,8 +133,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
}
if (&result != &self) {
at::native::resize_output(result, self__sizes);
if (beta.toComplexDouble() != 0.0) {
at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]});
if (beta.toComplexDouble() != 0.0 && !useLtInterface) {
at::native::copy_(result, *self_);
}
}
@ -147,7 +165,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
int64_t mat1_ld = mat1_->stride((transpose_mat1 == transpose_result) ? 1 : 0);
int64_t mat2_ld = mat2_->stride((transpose_mat2 == transpose_result) ? 1 : 0);
int64_t result_ld = result_->stride(transpose_result ? 0 : 1);
at::ScalarType scalar_type = self_->scalar_type();
if (mat1.numel() == 0) {
// By definition, when beta==0, values in self should be ignored. nans and infs
@ -170,24 +187,61 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj());
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
scalar_t* mat1_ptr = mat1_->data_ptr<scalar_t>();
scalar_t* mat2_ptr = mat2_->data_ptr<scalar_t>();
scalar_t* result_ptr = result_->data_ptr<scalar_t>();
at::cuda::blas::gemm<scalar_t>(
transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n',
transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n',
m, n, k,
alpha_val,
mat1_ptr, mat1_ld,
mat2_ptr, mat2_ld,
beta_val,
result_ptr, result_ld
);
});
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
if (useLtInterface) {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
scalar_type,
"addmm_cuda_lt",
[&] {
at::cuda::blas::gemm_and_bias<scalar_t>(
transpose_mat1,
transpose_mat2,
m,
n,
k,
alpha.to<at::opmath_type<scalar_t>>(),
mat1_->data_ptr<scalar_t>(),
mat1_ld,
mat2_->data_ptr<scalar_t>(),
mat2_ld,
self.data_ptr<scalar_t>(),
result_->data_ptr<scalar_t>(),
result_ld);
});
} else
#endif
{
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
scalar_type,
"addmm_cuda",
[&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
scalar_t* mat1_ptr = mat1_->data_ptr<scalar_t>();
scalar_t* mat2_ptr = mat2_->data_ptr<scalar_t>();
scalar_t* result_ptr = result_->data_ptr<scalar_t>();
at::cuda::blas::gemm<scalar_t>(
transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n',
transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n',
m,
n,
k,
alpha_val,
mat1_ptr,
mat1_ld,
mat2_ptr,
mat2_ld,
beta_val,
result_ptr,
result_ld);
});
}
if (!result.is_same(*result_)) {
result.copy_(*result_);
}

View File

@ -4,89 +4,9 @@
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/cuda/PinnedMemoryAllocator.h>
#if AT_MAGMA_ENABLED()
#include <magma_types.h>
#include <magma_v2.h>
#endif
namespace at {
namespace native {
#if AT_MAGMA_ENABLED()
// RAII for a MAGMA Queue
struct MAGMAQueue {
// The default constructor is deleted: constructing without a device would
// end up destroying a queue that was never initialized.
MAGMAQueue() = delete;
// Constructor
explicit MAGMAQueue(int64_t device_id) {
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// MAGMA operations are numerically sensitive, so TF32 should be off
// regardless of the global flag.
TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode));
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
magma_queue_create_from_cuda(
device_id,
at::cuda::getCurrentCUDAStream(),
handle,
at::cuda::getCurrentCUDASparseHandle(),
&magma_queue_);
}
// Getter
magma_queue_t get_queue() const { return magma_queue_; }
// Destructor
~MAGMAQueue() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// We manually set the math mode to CUBLAS_DEFAULT_MATH in the constructor,
// so restore the original math mode here.
cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_);
cublasSetMathMode(handle, original_math_mode);
#endif
magma_queue_destroy(magma_queue_);
}
private:
magma_queue_t magma_queue_;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
cublasMath_t original_math_mode;
#endif
};
static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
auto result = static_cast<magma_int_t>(value);
if (static_cast<int64_t>(result) != value) {
AT_ERROR("magma: The value of ", varname, "(", (long long)value,
") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)");
}
return result;
}
// MAGMA functions that don't take a magma_queue_t aren't stream safe
// Work around this by synchronizing with the default stream
struct MagmaStreamSyncGuard {
MagmaStreamSyncGuard() {
auto stream = at::cuda::getCurrentCUDAStream();
if (stream != at::cuda::getDefaultCUDAStream()) {
at::cuda::stream_synchronize(stream);
}
}
~MagmaStreamSyncGuard() noexcept(false) {
auto default_stream = at::cuda::getDefaultCUDAStream();
if (at::cuda::getCurrentCUDAStream() != default_stream) {
at::cuda::stream_synchronize(default_stream);
}
}
};
#endif
static inline int cuda_int_cast(int64_t value, const char* varname) {
auto result = static_cast<int>(value);
TORCH_CHECK(static_cast<int64_t>(result) == value,

View File

@ -1,342 +0,0 @@
#include <type_traits>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/TensorAccessor.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/KernelUtils.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/MemoryAccess.cuh>
#include <ATen/native/cuda/block_reduce.cuh>
#include <ATen/native/cuda/PersistentSoftmax.cuh>
#include <c10/cuda/CUDAMathCompat.h>
namespace at {
namespace native {
namespace {
Tensor gemm_nt(const Tensor& a, const Tensor& b) {
return at::native::matmul(a, b.t());
}
static constexpr int TRANSFORM_BIAS_RESCALE_VEC = 4;
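// Width of each vectorized load/store (in elements) used by the kernel below.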
template <typename scalar_t, typename accscalar_t, bool assume_aligned>
__global__ void transform_bias_rescale_qkv_kernel(
// [B, T, 3 * D]
const PackedTensorAccessor64<scalar_t, 3, RestrictPtrTraits> qkv,
// [3 * D]
const PackedTensorAccessor64<scalar_t, 1, RestrictPtrTraits> qkv_bias,
// [3, B, NH, T, DH]
PackedTensorAccessor64<scalar_t, 5, RestrictPtrTraits> q_k_v) {
// One warp per DH, so launch B * NH * T warps.
auto NH = q_k_v.size(2);
auto T = q_k_v.size(3);
auto DH = q_k_v.size(4);
auto t = blockIdx.x % T;
auto b = blockIdx.x / T;
auto D = NH * DH;
const scalar_t sqrt_dim_per_head = std::sqrt(static_cast<scalar_t>(DH));
if (assume_aligned) {
constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC;
using LoadT = memory::aligned_vector<scalar_t, VEC>;
for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) {
auto d = d_v * VEC;
auto nh = d / DH;
auto dh = d % DH;
scalar_t qkv_bias_q[VEC];
scalar_t qkv_bias_k[VEC];
scalar_t qkv_bias_v[VEC];
scalar_t qkv_q[VEC];
scalar_t qkv_k[VEC];
scalar_t qkv_v[VEC];
// Here we require D % VEC == 0 for these vectorized loads.
*reinterpret_cast<LoadT*>(&qkv_bias_q) =
*reinterpret_cast<const LoadT*>(&qkv_bias[d + 0 * D]);
*reinterpret_cast<LoadT*>(&qkv_bias_k) =
*reinterpret_cast<const LoadT*>(&qkv_bias[d + 1 * D]);
*reinterpret_cast<LoadT*>(&qkv_bias_v) =
*reinterpret_cast<const LoadT*>(&qkv_bias[d + 2 * D]);
*reinterpret_cast<LoadT*>(&qkv_q) =
*reinterpret_cast<const LoadT*>(&qkv[b][t][d + 0 * D]);
*reinterpret_cast<LoadT*>(&qkv_k) =
*reinterpret_cast<const LoadT*>(&qkv[b][t][d + 1 * D]);
*reinterpret_cast<LoadT*>(&qkv_v) =
*reinterpret_cast<const LoadT*>(&qkv[b][t][d + 2 * D]);
#pragma unroll
// TODO: specialize for float2half2/half2float2?
for (auto ii = 0; ii < VEC; ++ii) {
qkv_q[ii] = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_q[ii]) +
static_cast<accscalar_t>(qkv_bias_q[ii])) /
static_cast<accscalar_t>(sqrt_dim_per_head));
qkv_k[ii] = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_k[ii]) +
static_cast<accscalar_t>(qkv_bias_k[ii])));
qkv_v[ii] = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_v[ii]) +
static_cast<accscalar_t>(qkv_bias_v[ii])));
}
// Here we require DH % VEC == 0 for these vectorized stores.
*reinterpret_cast<LoadT*>(&q_k_v[0][b][nh][t][dh]) =
*reinterpret_cast<const LoadT*>(&qkv_q);
*reinterpret_cast<LoadT*>(&q_k_v[1][b][nh][t][dh]) =
*reinterpret_cast<const LoadT*>(&qkv_k);
*reinterpret_cast<LoadT*>(&q_k_v[2][b][nh][t][dh]) =
*reinterpret_cast<const LoadT*>(&qkv_v);
}
} else {
// Same as above, but we can't vectorize memory access.
for (int32_t d = threadIdx.x; d < D; d += blockDim.x) {
auto nh = d / DH;
auto dh = d % DH;
scalar_t qkv_bias_q = qkv_bias[d + 0 * D];
scalar_t qkv_bias_k = qkv_bias[d + 1 * D];
scalar_t qkv_bias_v = qkv_bias[d + 2 * D];
scalar_t qkv_q = qkv[b][t][d + 0 * D];
scalar_t qkv_k = qkv[b][t][d + 1 * D];
scalar_t qkv_v = qkv[b][t][d + 2 * D];
qkv_q = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_q) +
static_cast<accscalar_t>(qkv_bias_q)) /
static_cast<accscalar_t>(sqrt_dim_per_head));
qkv_k = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_k) +
static_cast<accscalar_t>(qkv_bias_k)));
qkv_v = static_cast<scalar_t>(
(static_cast<accscalar_t>(qkv_v) +
static_cast<accscalar_t>(qkv_bias_v)));
q_k_v[0][b][nh][t][dh] = qkv_q;
q_k_v[1][b][nh][t][dh] = qkv_k;
q_k_v[2][b][nh][t][dh] = qkv_v;
}
}
}
// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto B = qkv.size(0);
auto T = qkv.size(1);
auto _3D = qkv.size(2);
auto D = _3D / 3;
TORCH_CHECK(D % num_head == 0);
const auto dim_per_head = D / num_head;
auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options());
#define CALL_KERNEL(assume_aligned) \
transform_bias_rescale_qkv_kernel<scalar_t, accscalar_t, assume_aligned> \
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( \
qkv.packed_accessor64<scalar_t, 3, RestrictPtrTraits>(), \
qkv_bias.packed_accessor64<scalar_t, 1, RestrictPtrTraits>(), \
q_k_v.packed_accessor64<scalar_t, 5, RestrictPtrTraits>())
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
qkv.scalar_type(),
"transform_bias_rescale_qkv",
[&] {
using accscalar_t = acc_type<scalar_t, true>;
auto threads = std::max(std::min<int32_t>(1024, D / TRANSFORM_BIAS_RESCALE_VEC), 1);
auto blocks = B * T;
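// One block per (b, t) position; threads in the block stride over the
// D = num_head * dim_per_head feature dimension.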
if (dim_per_head % TRANSFORM_BIAS_RESCALE_VEC == 0) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
D % TRANSFORM_BIAS_RESCALE_VEC == 0,
"D = num_heads * dim_per_head, so we should have dim_per_head % "
"TRANSFORM_BIAS_RESCALE_VEC == 0 => "
"D % TRANSFORM_BIAS_RESCALE_VEC == 0");
CALL_KERNEL(true);
} else {
CALL_KERNEL(false);
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
#undef CALL_KERNEL
auto q_k_v_s =
at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0);
return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]);
}
Tensor bmm_nt(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
auto bt_ = b_.transpose(2, 1);
// TODO: could these be a single call to cublas batched matmul?
auto c_ = at::matmul(a_, bt_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)});
}
template <typename T>
__inline__ __device__ T WarpReduceMax(T val) {
#pragma unroll
for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
val = std::max(val, WARP_SHFL_DOWN(val, offset));
}
return val;
}
template <typename T>
__inline__ __device__ T WarpReduceSum(T val) {
#pragma unroll
for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
val += WARP_SHFL_DOWN(val, offset);
}
return val;
}
void masked_softmax_dropout(
const Tensor& attn_scores,
const c10::optional<Tensor>& attn_mask) {
auto B = attn_scores.size(0);
auto num_heads = attn_scores.size(1);
auto T = attn_scores.size(2);
if (attn_mask) {
TORCH_CHECK(attn_mask->is_contiguous());
}
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::Half,
ScalarType::BFloat16,
attn_scores.scalar_type(),
"masked_softmax_dropout",
[&] {
using accscalar_t = acc_type<scalar_t, true>;
// TODO: proper implementation with masking.
dispatch_softmax_forward<scalar_t, scalar_t, accscalar_t, false, false>(
attn_scores.data_ptr<scalar_t>(),
attn_scores.data_ptr<scalar_t>(),
T,
T,
B * num_heads * T
);
});
}
Tensor bmm_nn(const Tensor& a, const Tensor& b) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
// TODO: could these be a single call to cublas batched matmul?
auto c_ = at::matmul(a_, b_);
return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)});
}
Tensor transform_0213(const Tensor& a) {
// TODO: check perf vs dedicated kernel.
return a.permute({0, 2, 1, 3})
.contiguous()
.view({a.size(0), a.size(2), a.size(1) * a.size(3)});
}
Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) {
auto a_ = a.view({a.size(0) * a.size(1), a.size(2)});
auto r_ = at::native::linear(a_, b, c);
return r_.view({a.size(0), a.size(1), r_.size(1)});
}
void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim());
for (auto idx : c10::irange(shape.size())) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]);
}
}
} // namespace
std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_op_cuda(
const Tensor& qkv,
const Tensor& qkv_bias,
const int64_t num_head) {
auto result = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone());
}
Tensor multi_head_self_attention_cuda(
const Tensor& query,
const Tensor& qkv_weight,
const Tensor& qkv_bias,
const Tensor& proj_weight,
const Tensor& proj_bias,
const int64_t num_head,
const c10::optional<Tensor>& mask) {
// query shape: [B, T, D]
// qkv_weight shape: [3 * D, D]
const auto D = query.sizes()[2];
TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor");
TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor");
TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query");
TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal");
TORCH_CHECK(D % num_head == 0, "D must be divisible by num_head");
#ifndef NDEBUG
const auto B = query.sizes()[0];
const auto T = query.sizes()[1];
const auto dim_per_head = D / num_head;
#endif
// shape: [B, T, 3 x D]
auto qkv = gemm_nt(query, qkv_weight);
#ifndef NDEBUG
debug_assert_shape(qkv, {B, T, 3 * D});
#endif
// shape: 3 x [B, num_head, T, dim_per_head]
auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
const auto& q = std::get<0>(q_k_v);
const auto& k = std::get<1>(q_k_v);
const auto& v = std::get<2>(q_k_v);
#ifndef NDEBUG
debug_assert_shape(q, {B, num_head, T, dim_per_head});
debug_assert_shape(k, {B, num_head, T, dim_per_head});
debug_assert_shape(v, {B, num_head, T, dim_per_head});
#endif
// shape: [B, num_head, T, T]
auto qkt = bmm_nt(q, k);
#ifndef NDEBUG
debug_assert_shape(qkt, {B, num_head, T, T});
#endif
// shape: [B, num_head, T, T]
masked_softmax_dropout(qkt, mask);
// shape: [B, num_head, T, dim_per_head]
auto attn_ctx = bmm_nn(qkt, v);
#ifndef NDEBUG
debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head});
#endif
// shape: [B, T, D]
auto attn = transform_0213(attn_ctx);
#ifndef NDEBUG
debug_assert_shape(attn, {B, T, D});
#endif
// shape: [B, T, D]
auto proj = gemm_nt_bias(attn, proj_weight, proj_bias);
#ifndef NDEBUG
debug_assert_shape(proj, {B, T, D});
#endif
return proj;
}
} // namespace native
} // namespace at

View File

@ -13,6 +13,7 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/BatchLinearAlgebra.h>
#include <ATen/native/cuda/linalg/BatchLinearAlgebraLib.h>
#include <ATen/native/cuda/linalg/MagmaUtils.h>
#include <ATen/native/cpu/zmath.h>
#if AT_MAGMA_ENABLED()

View File

@ -0,0 +1,88 @@
#pragma once
#include <ATen/cuda/CUDAConfig.h>
#if AT_MAGMA_ENABLED()
#include <magma_types.h>
#include <magma_v2.h>
#endif
namespace at {
namespace native {
#if AT_MAGMA_ENABLED()
// RAII for a MAGMA Queue
struct MAGMAQueue {
// The default constructor is deleted: constructing without a device would
// end up destroying a queue that was never initialized.
MAGMAQueue() = delete;
// Constructor
explicit MAGMAQueue(int64_t device_id) {
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// MAGMA operations are numerically sensitive, so TF32 should be off
// regardless of the global flag.
TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode));
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
magma_queue_create_from_cuda(
device_id,
at::cuda::getCurrentCUDAStream(),
handle,
at::cuda::getCurrentCUDASparseHandle(),
&magma_queue_);
}
// Getter
magma_queue_t get_queue() const { return magma_queue_; }
// Destructor
~MAGMAQueue() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
// We manually set the math mode to CUBLAS_DEFAULT_MATH in the constructor,
// so restore the original math mode here.
cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_);
cublasSetMathMode(handle, original_math_mode);
#endif
magma_queue_destroy(magma_queue_);
}
private:
magma_queue_t magma_queue_;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
cublasMath_t original_math_mode;
#endif
};
static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
auto result = static_cast<magma_int_t>(value);
if (static_cast<int64_t>(result) != value) {
AT_ERROR("magma: The value of ", varname, "(", (long long)value,
") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)");
}
return result;
}
// MAGMA functions that don't take a magma_queue_t aren't stream safe
// Work around this by synchronizing with the default stream
struct MagmaStreamSyncGuard {
MagmaStreamSyncGuard() {
auto stream = at::cuda::getCurrentCUDAStream();
if (stream != at::cuda::getDefaultCUDAStream()) {
at::cuda::stream_synchronize(stream);
}
}
~MagmaStreamSyncGuard() noexcept(false) {
auto default_stream = at::cuda::getDefaultCUDAStream();
if (at::cuda::getCurrentCUDAStream() != default_stream) {
at::cuda::stream_synchronize(default_stream);
}
}
};
#endif
} // namespace native
} // namespace at

View File

@ -2549,16 +2549,6 @@
CUDA: layer_norm_cuda
CompositeImplicitAutograd: math_native_layer_norm
- func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, int num_head, Tensor? mask=None) -> Tensor
dispatch:
CPU: multi_head_self_attention_cpu
CUDA: multi_head_self_attention_cuda
- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_head) -> (Tensor, Tensor, Tensor)
dispatch:
CPU: transform_bias_rescale_qkv_op_cpu
CUDA: transform_bias_rescale_qkv_op_cuda
- func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
dispatch:
CPU: layer_norm_backward_cpu
@ -6066,7 +6056,7 @@
- func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
variants: function, method
- func: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
variants: function, method
dispatch:
CPU: scatter_reduce_two_cpu

View File

@ -18,6 +18,10 @@ void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
imageStore(uOutput, pos, tanh(texelFetch(uInput, pos, 0)));
const vec4 intex = texelFetch(uInput, pos, 0);
imageStore(
uOutput,
pos,
tanh(clamp(intex, -15.0, 15.0)));
}
}

View File

@ -17,6 +17,10 @@ void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
imageStore(uOutput, pos, tanh(imageLoad(uOutput, pos)));
const vec4 intex = imageLoad(uOutput, pos);
imageStore(
uOutput,
pos,
tanh(clamp(intex, -15.0, 15.0)));
}
}

View File

@ -322,6 +322,13 @@ Tensor add_tensor(
const Tensor& self_arg,
const Tensor& other_arg,
const Scalar& alpha) {
if (other_arg.sizes().size() == 0) {
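// A zero-dim `other` is treated as a scalar and routed to the scalar kernel (add_scalar).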
return arithmetic_scalar(
self_arg,
other_arg.item<float>(),
c10::optional<Scalar>(alpha.to<float>()),
VK_KERNEL(add_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(alpha), VK_KERNEL(add));
}
@ -354,6 +361,13 @@ Tensor sub_tensor(
const Tensor& self_arg,
const Tensor& other_arg,
const Scalar& alpha) {
if (other_arg.sizes().size() == 0) {
return arithmetic_scalar(
self_arg,
other_arg.item<float>(),
c10::optional<Scalar>(-1 * alpha.to<float>()),
VK_KERNEL(add_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(alpha), VK_KERNEL(sub));
}
@ -374,6 +388,13 @@ Tensor& mul_scalar_(Tensor& self, const Scalar& other) {
}
Tensor mul_tensor(const Tensor& self_arg, const Tensor& other_arg) {
if (other_arg.sizes().size() == 0) {
return arithmetic_scalar(
self_arg,
other_arg.item<float>(),
c10::optional<Scalar>(),
VK_KERNEL(mul_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(), VK_KERNEL(mul));
}
@ -400,6 +421,13 @@ Tensor& div_scalar_(Tensor& self, const Scalar& other) {
}
Tensor div_tensor(const Tensor& self_arg, const Tensor& other_arg) {
if (other_arg.sizes().size() == 0) {
return arithmetic_scalar(
self_arg,
1.0 / other_arg.item<float>(),
c10::optional<Scalar>(),
VK_KERNEL(mul_scalar));
}
return arithmetic_tensor(
self_arg, other_arg, c10::optional<Scalar>(), VK_KERNEL(div));
}

View File

@ -1551,7 +1551,7 @@ TEST(VulkanAPITest, tanh) {
return;
}
const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
const auto in_vulkan = in_cpu.vulkan();
const auto out_cpu = at::tanh(in_cpu);
@ -1570,7 +1570,7 @@ TEST(VulkanAPITest, tanh_) {
return;
}
auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
auto vulkan = cpu.vulkan();
at::tanh_(cpu);

View File

@ -35,6 +35,58 @@ static void cat_op_channel_perf(benchmark::State& state) {
}
}
static void gru_op_perf(benchmark::State& state) {
// Guard
if (!at::is_vulkan_available()) {
return;
}
// Arrange
const int H_in = static_cast<int>(state.range(0)); // input_size
const int H_out = static_cast<int>(state.range(1)); // hidden_size
const int num_layers = static_cast<int>(state.range(2));
const double gru_dropout = .0;
const bool has_biases = true;
const bool train = false;
const bool bidirectional = false;
const bool batch_first = true;
const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat));
const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat));
c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size)
c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size)
c10::List<at::Tensor> bias_ih_l; // shape (3 * hidden_size)
c10::List<at::Tensor> bias_hh_l; // shape (3 * hidden_size)
for (int i = 0; i < num_layers; ++i) {
weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat)));
weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat)));
bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat)));
}
// put this guard here to run inference instead of training
// to avoid the following error:
// C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend.
// If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present.
c10::InferenceMode mode;
// Act
while (state.KeepRunning()) {
// weights/biases should always be on CPU.
const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0),
weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) },
has_biases, num_layers, gru_dropout, train, bidirectional, batch_first);
auto vulkan_output = std::get<0>(out_vulkan);
auto vulkan_hidden = std::get<1>(out_vulkan);
// to avoid out-of-memory issues, release resources by waiting and flushing all GPU operations
at::native::vulkan::api::context()->wait(vulkan_output);
at::native::vulkan::api::context()->wait(vulkan_hidden);
at::native::vulkan::api::context()->flush();
}
}
static void CommonBenchmarkSettings(benchmark::internal::Benchmark* b) {
b->Unit(benchmark::kMillisecond);
b->ArgNames({"N", "C", "H", "W"});
@ -48,6 +100,7 @@ BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iter
BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 4, 221, 193}); // small multiple of 4 channels
BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 3, 221, 193}); // small non-multiple of 4 channels
BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(3)->Iterations(1000)->Args({3, 40, 221, 193}); // big multiple of 4 channels (multi-thread)
BENCHMARK(gru_op_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({384, 384, 2}); // McLaren Model inputs
BENCHMARK_MAIN();
#endif /* USE_VULKAN_API */

View File

@ -0,0 +1,83 @@
#include <gtest/gtest.h>
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/torch.h>
#include "test_utils.h"
using namespace torch;
using namespace torch::jit;
using namespace torch::jit::test;
TEST(CpuFusion, Simple) {
const auto simple_script = R"JIT(
def forward(self, a, b):
return (a + b).relu().tanh()
)JIT";
Module m("module");
m.define(simple_script);
StaticModuleOptions opts; // start with the defaults.
opts.enable_tensorexpr_fusion = true;
auto input1 = at::randn({2, 3});
auto input2 = at::ones({2, 3});
auto smodule = StaticModule(m, /* is_frozen */ false, opts, {input1, input2});
StaticRuntime runtime(smodule);
// Test with sample inputs
{
auto actual = runtime({input1, input2}, {});
auto expect = at::tanh(at::relu(input1 + input2));
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
// Test with different inputs
{
auto new_input1 = at::randn({5, 14});
auto new_input2 = at::randn({5, 14});
auto actual = runtime({new_input1, new_input2}, {});
auto expect = at::tanh(at::relu(new_input1 + new_input2));
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
}
TEST(CpuFusion, FallbackGraph) {
const auto simple_script = R"JIT(
def forward(self, a, b):
return (a + b).relu().tanh()
)JIT";
Module m("module");
m.define(simple_script);
StaticModuleOptions opts; // start with the defaults.
opts.enable_tensorexpr_fusion = true;
auto sample_input1 = at::randn({2, 3});
auto sample_input2 = at::ones({2, 3});
auto smodule = StaticModule(
m, /* is_frozen */ false, opts, {sample_input1, sample_input2});
StaticRuntime runtime(smodule);
// The sample inputs above were contiguous. Now, use a strided input
// to trigger running the fallback graph.
{
auto input1 = at::narrow(at::randn({2, 6}), 1, 0, 3);
auto input2 = at::ones({2, 3});
auto expect = at::tanh(at::relu(input1 + input2));
auto actual = runtime({input1, input2}, {});
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
// Test with strided inputs of different size.
{
auto input1 = at::narrow(at::randn({10, 30}), 1, 0, 25);
auto input2 = at::randn({10, 25});
auto expect = at::tanh(at::relu(input1 + input2));
auto actual = runtime({input1, input2}, {});
EXPECT_TRUE(at::allclose(expect, actual.toTensor()));
}
}

View File

@ -180,35 +180,48 @@ class vkRunner final : public Runner<T> {
virtual c10::IValue run(
T& module,
const std::vector<c10::IValue>& inputs) override {
// Upload the input tensor(s) to GPU memory.
inputs_.clear();
inputs_.reserve(inputs.size());
for (const auto& input : inputs) {
if (input.isTensor()) {
inputs_.emplace_back(input.toTensor().vulkan());
}
else if (input.isList()) {
const c10::List<c10::IValue> input_as_list = input.toList();
c10::List<at::Tensor> input_vk_list;
input_vk_list.reserve(input_as_list.size());
for (int i=0; i < input_as_list.size(); ++i) {
const c10::IValue element = input_as_list.get(i);
if (element.isTensor()) {
input_vk_list.emplace_back(element.toTensor().vulkan());
}
else {
CAFFE_THROW("Input of type c10::List must only contain Tensors!");
}
if (inputs_.size() == 0) {
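// inputs_ is cached across runs, so this upload happens only on the first call.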
// Upload the input tensor(s) to GPU memory.
inputs_.clear();
inputs_.reserve(inputs.size());
for (const auto& input : inputs) {
if (input.isTensor()) {
inputs_.emplace_back(at::rand(input.toTensor().sizes()).vulkan());
}
else if (input.isTensorList()) {
const c10::List<at::Tensor> input_as_list = input.toTensorList();
c10::List<at::Tensor> input_vk_list;
input_vk_list.reserve(input_as_list.size());
for (int i=0; i < input_as_list.size(); ++i) {
const at::Tensor element = input_as_list.get(i);
input_vk_list.emplace_back(at::rand(element.sizes()).vulkan());
}
inputs_.emplace_back(c10::IValue(input_vk_list));
}
else {
CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::TensorList!");
}
inputs_.emplace_back(c10::IValue(input_vk_list));
}
else {
CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::List!");
}
}
// Run, and download the output tensor to system memory.
return module.forward(inputs_).toTensor().cpu();
c10::IValue output = module.forward(inputs_);
if (output.isTensor()) {
return output.toTensor().cpu();
}
else if (output.isTensorList()) {
return output.toTensorList().get(0).cpu();
}
else if (output.isList()) {
return output.toList().get(0).toTensor().cpu();
}
else if (output.isTuple()) {
return output.toTuple()->elements()[0].toTensor().cpu();
}
else {
CAFFE_THROW("Outputs must only be either c10::Tensor or c10::TensorList!");
};
}
private:

View File

@ -44,7 +44,7 @@ class BisectPercentileOp final : public Operator<Context> {
pct_upper_.size(),
"Feature (raw) data and upper bound dimension should match.");
n_features = pct_lens_.size();
index.reserve(n_features + 1);
index.resize(n_features + 1);
index[0] = 0;
for (int i = 1; i <= n_features; ++i) {
index[i] = index[i - 1] + pct_lens_[i - 1];
@ -115,13 +115,10 @@ class BisectPercentileOp final : public Operator<Context> {
int lo,
int hi,
float val) {
int mid;
bool low_cond, high_cond;
while (lo < hi) {
mid = (lo + hi) >> 1;
low_cond = (data[mid] <= val);
high_cond = (val < data[mid + 1]);
const auto mid = lo + (hi - lo) / 2;
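// Computing the midpoint as lo + (hi - lo) / 2 avoids overflow when lo + hi would exceed INT_MAX.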
const bool low_cond = (data[mid] <= val);
const bool high_cond = (val < data[mid + 1]);
if (low_cond && high_cond) {
return mid;
} else if (!low_cond) {

View File

@ -1,13 +1,16 @@
import errno
import os
import shutil
import tempfile
import unittest
from collections import namedtuple
from typing import List
import caffe2.python.hypothesis_test_util as htu
import hypothesis.strategies as st
import numpy as np
import torch
from torch import Tensor
from caffe2.proto import caffe2_pb2
from caffe2.python import core, test_util, workspace, model_helper, brew
from hypothesis import given, settings
@ -783,8 +786,7 @@ class MyModule(torch.jit.ScriptModule):
return x + y + z
@torch.jit.script_method
def multi_input_tensor_list(self, tensor_list): # pyre-ignore: PT type annotations
# type: (List[Tensor]) -> Tensor
def multi_input_tensor_list(self, tensor_list: List[Tensor]) -> Tensor:
return tensor_list[0] + tensor_list[1] + tensor_list[2]
@torch.jit.script_method

View File

@ -115,11 +115,13 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L;
// torchscript constant table. Also update tensor storage schema adapting to
// the unify format, the root key of tensor storage is updated from {index} to
// {the_pointer_value_the_tensor.storage}, for example:
// `140245072983168.storage` Forward-compatibility change. 0x6L: Implicit
// opereator versioning using number of specified argument. Refer to the
// summary of https://github.com/pytorch/pytorch/pull/56845 for details. 0x7L:
// Enable support for operators with default arguments plus out arguments.
// 0x8L: Emit promoted operators as instructions
// `140245072983168.storage` Forward-compatibility change.
// 0x6L: Implicit operator versioning using the number of specified arguments.
// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for details.
// 0x7L: Enable support for operators with default arguments plus out arguments.
// See https://github.com/pytorch/pytorch/pull/63651 for details.
// 0x8L: Emit promoted operators as instructions.
// See https://github.com/pytorch/pytorch/pull/71662 for details
constexpr uint64_t kProducedBytecodeVersion = 0x8L;
// static_assert(

View File

@ -593,6 +593,7 @@ Tensor class reference
Tensor.scatter_
Tensor.scatter_add_
Tensor.scatter_add
Tensor.scatter_reduce
Tensor.select
Tensor.select_scatter
Tensor.set_

View File

@ -118,6 +118,7 @@ Indexing, Slicing, Joining, Mutating Ops
select_scatter
slice_scatter
scatter_add
scatter_reduce
split
squeeze
stack

View File

@ -5,6 +5,7 @@
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
#include <torch/csrc/jit/runtime/interpreter.h>
#include <torch/csrc/jit/testing/file_check.h>
#include <sstream>
@ -350,5 +351,52 @@ TEST(TEFuserPass, FuserPass_WhereList) {
testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
}
TEST(TEFuserPass, DynamicShapeFusion) {
WithCPUFuser cf;
const auto graph_string = R"IR(
graph(%0 : Float(10, 5, strides=[5, 1], device=cpu),
%1 : Float(10, 5, strides=[5, 1], device=cpu)):
%2 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%0, %1)
%3 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%2, %1)
return (%3))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
FuseTensorExprs(
g,
/* min_group_size = */ 2,
/* add_composed_op = */ true,
/* fuse_to_dynamic_shapes = */ true);
Code code(g, "");
testing::FileCheck()
.check("prim::TensorExprDynamicGroup_")
->check("prim::TensorExprDynamicGuard")
->check("prim::TensorExprGroup_")
->run(*g);
auto run_and_compare = [&](const std::vector<at::Tensor>& inputs) {
TORCH_INTERNAL_ASSERT(inputs.size() == 2);
auto ref = at::mul(at::mul(inputs[0], inputs[1]), inputs[1]);
InterpreterState interp(code);
Stack stack(inputs.begin(), inputs.end());
interp.run(stack);
at::Tensor out = pop(stack).toTensor();
ASSERT_TRUE(at::allclose(out, ref));
};
std::vector<at::Tensor> inputs = {at::rand({10, 5}), at::rand({10, 5})};
run_and_compare(inputs);
std::vector<at::Tensor> inputs2 = {at::rand({20, 5}), at::rand({20, 5})};
run_and_compare(inputs2);
std::vector<at::Tensor> inputs3 = {at::rand({25, 60}), at::rand({25, 60})};
run_and_compare(inputs3);
}
} // namespace jit
} // namespace torch

View File

@ -33,7 +33,7 @@ from torch.distributed.algorithms.join import Join, Joinable, JoinHook
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD
from torch.optim import SGD, AdamW
from torch.testing._internal import common_distributed, common_utils
from torch.testing._internal.common_utils import (
TEST_WITH_ASAN,
@ -249,27 +249,54 @@ class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer):
def test_constructor(self):
"""Check the robustness of the ZeroRedundancyOptimizer constructor by
passing different values for `params`"""
passing different values for the ``params`` argument."""
self.dist_init(self.rank)
m = torch.nn.Linear(1, 1)
# (input, expected error)
inputs = [
m = torch.nn.Sequential(
torch.nn.Linear(5, 10),
torch.nn.Linear(10, 10),
torch.nn.Linear(10, 10),
)
# Test various constructor inputs in the form: (input, expected error)
ctor_inputs = [
([], ValueError), # empty parameter list
(torch.randn(1), TypeError), # non-iterable: `torch.Tensor`
(1.2, TypeError), # non-iterable: `float`
([{"params": m.parameters()}], TypeError), # iterable of dict
(list(m.parameters()) + [42], TypeError), # iterable containing non-`torch.Tensor`
([
{"params": [l.weight for l in m]},
{"params": [l.bias for l in m]},
], None), # iterable of dict
(list(m.parameters()) + [42], TypeError), # iterable containing invalid type
(m.parameters(), None), # `params` as a generator
(list(m.parameters()), None) # `params` as a list
]
for input, error in inputs:
if (error):
for ctor_input, error in ctor_inputs:
if error:
with self.assertRaises(error):
ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1)
ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01)
else:
ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1)
ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01)
# Test constructing with multiple parameter groups more thoroughly
weight_decay = 0.01
lr = 0.01
betas = (0.9, 0.999)
eps = 1e-8
params = [
{"params": [l.weight for l in m], "weight_decay": 0.},
{"params": [l.bias for l in m], "weight_decay": weight_decay},
]
o = ZeroRedundancyOptimizer(
params, optimizer_class=AdamW,
lr=lr, betas=betas, eps=eps,
)
assert len(o.param_groups) == 2, \
f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}"
assert len(o.optim.param_groups) == 2, \
"Expected 2 local optimizer param groups, but got " \
f"{len(o.optim.param_groups)}"
def test_same_dense_param_type(self):
"""Check that ZeroRedundancyOptimizer raises an exception if the input
@ -459,7 +486,76 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
all_trainable()
some_trainable()
@common_distributed.skip_if_no_gpu
def test_multiple_param_groups(self):
"""
Tests parity between constructing ZeRO with multiple parameter groups
upfront versus adding parameter groups to ZeRO after construction
versus a non-sharded optimizer.
"""
self.dist_init(self.rank)
model1 = torch.nn.Sequential(
torch.nn.Linear(5, 10),
torch.nn.Linear(10, 10),
torch.nn.Linear(10, 5),
)
model2 = copy.deepcopy(model1)
model3 = copy.deepcopy(model1)
model1 = model1.to(self.device)
model2 = model2.to(self.device)
model3 = model3.to(self.device)
batch_size = 8
num_iters = 3
inputs = [
torch.randn(batch_size, 5).to(self.device) for _ in range(num_iters)
]
wd = 0.01
lr = 0.01
# Construct `optim1` with both parameter groups upfront
optim1 = ZeroRedundancyOptimizer(
[
{"params": [l.weight for l in model1], "weight_decay": 0.},
{"params": [l.bias for l in model1], "weight_decay": wd},
],
optimizer_class=AdamW, lr=lr,
)
# Construct `optim2` by adding the second parameter after
optim2 = ZeroRedundancyOptimizer(
[l.weight for l in model2],
optimizer_class=AdamW, lr=lr, weight_decay=0.,
)
optim2.add_param_group(
{"params": [l.bias for l in model2], "weight_decay": wd}
)
# Construct `optim3` as a non-sharded optimizer
optim3 = AdamW(
[
{"params": [l.weight for l in model3], "weight_decay": 0.},
{"params": [l.bias for l in model3], "weight_decay": wd},
], lr=lr,
)
# Check parity over a few iterations
for iter in range(num_iters):
for model, optim in (
(model1, optim1), (model2, optim2), (model3, optim3),
):
optim.zero_grad()
out = model(inputs[iter])
loss = out.sum()
loss.backward()
optim.step()
for layer1, layer2, layer3 in zip(model1, model2, model3):
assert torch.allclose(layer1.weight, layer2.weight)
assert torch.allclose(layer1.weight, layer3.weight)
assert torch.allclose(layer1.bias, layer2.bias)
assert torch.allclose(layer1.bias, layer3.bias)
@common_distributed.skip_if_lt_x_gpu(2)
@common_distributed.skip_if_rocm
def test_collect_shards(self):
""" Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer"""
self.dist_init(self.rank)

View File

@ -106,7 +106,8 @@ ALLOW_LIST = [
("aten::_scatter_reduce", datetime.date(2022, 1, 31)),
("aten::native_multi_head_self_attention", datetime.date(9999, 1, 1)),
("aten::_native_multi_head_self_attention", datetime.date(9999, 1, 1)),
("aten::scatter_reduce.two", datetime.date(2022, 3, 15)),
("aten::_transform_bias_rescale_qkv", datetime.date(9999, 1, 1)),
("aten::_scatter_reduce.two", datetime.date(9999, 1, 1)),
]
ALLOW_LIST_COMPILED = [

View File

@ -41,6 +41,7 @@ from collections import OrderedDict
from torch.nn.utils.rnn import PackedSequence
from torch.onnx import CheckerError, register_custom_op_symbolic, unregister_custom_op_symbolic
from torch.onnx.symbolic_helper import _unimplemented
from torch.onnx.utils import unpack_quantized_tensor
def flatten_tuples(elem):
@ -108,9 +109,16 @@ def inline_flatten_list(inputs, res_list):
return res_list
def unpack_to_numpy(value):
value_unpacked = []
for value_ in value:
value_unpacked.extend(unpack_quantized_tensor(value_))
value_final = [to_numpy(v) for v in value_unpacked]
return value_final
def run_ort(ort_sess, input):
input = flatten_tuples(input)
input = to_numpy(input)
input = unpack_to_numpy(flatten_tuples(input))
ort_inputs = dict((ort_sess.get_inputs()[i].name, input) for i, input in enumerate(input))
ort_outs = ort_sess.run(None, ort_inputs)
return inline_flatten_list(ort_outs, [])
@ -118,7 +126,7 @@ def run_ort(ort_sess, input):
def ort_compare_with_pytorch(ort_outs, output, rtol, atol):
output, _ = torch.jit._flatten(output)
outputs = [to_numpy(outp) for outp in output]
outputs = unpack_to_numpy(output)
# compare onnxruntime and PyTorch results
assert len(outputs) == len(ort_outs), "number of outputs differ"
@ -5895,7 +5903,24 @@ class TestONNXRuntime(unittest.TestCase):
return torch.pixel_shuffle(x, upscale_factor=2)
x = torch.randn(2, 16, 4, 3, requires_grad=True)
y = torch.randn(4, 32, 8, 4, requires_grad=True)
self.run_test(PixelShuffle(), x)
self.run_test(PixelShuffle(), x, input_names=["x"],
dynamic_axes={"x": [0, 1, 2, 3]},
test_with_inputs=[y])
@skipIfUnsupportedMinOpsetVersion(9)
def test_pixel_unshuffle(self):
class PixelUnshuffle(torch.nn.Module):
def forward(self, x):
return torch.pixel_unshuffle(x, downscale_factor=2)
x = torch.randn(2, 16, 4, 6, requires_grad=True)
y = torch.randn(4, 32, 8, 4, requires_grad=True)
self.run_test(PixelUnshuffle(), x)
self.run_test(PixelUnshuffle(), x, input_names=["x"],
dynamic_axes={"x": [0, 1, 2, 3]},
test_with_inputs=[y])
@skipIfUnsupportedMinOpsetVersion(9)
def test_reciprocal(self):
@ -6924,6 +6949,128 @@ class TestONNXRuntime(unittest.TestCase):
x = torch.randn(2, 3, 5, 5)
self.run_test(Det(), x)
def test_linalg_norm(self):
class LinalgSingleDimModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgSingleDimModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord, dim=1)
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgSingleDimModel(None), x)
self.run_test(LinalgSingleDimModel(2), x)
self.run_test(LinalgSingleDimModel(float('inf')), x)
self.run_test(LinalgSingleDimModel(-float('inf')), x)
self.run_test(LinalgSingleDimModel(-4), x)
self.run_test(LinalgSingleDimModel(1.5), x)
class LinalgMultiDimModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgMultiDimModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord, dim=(0, 2))
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgMultiDimModel('fro'), x)
self.run_test(LinalgMultiDimModel(float('inf')), x)
self.run_test(LinalgMultiDimModel(-float('inf')), x)
self.run_test(LinalgMultiDimModel(1), x)
self.run_test(LinalgMultiDimModel(-1), x)
class LinalgNoDimNoOrdModel(torch.nn.Module):
def forward(self, x):
return torch.linalg.norm(x)
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgNoDimNoOrdModel(), x)
y = torch.randn(2, 3)
self.run_test(LinalgNoDimNoOrdModel(), y)
z = torch.randn(2)
self.run_test(LinalgNoDimNoOrdModel(), z)
class LinalgNoDim1DModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgNoDim1DModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord)
x = torch.randn(2)
self.run_test(LinalgNoDim1DModel(None), x)
self.run_test(LinalgNoDim1DModel(2), x)
self.run_test(LinalgNoDim1DModel(float('inf')), x)
self.run_test(LinalgNoDim1DModel(-float('inf')), x)
self.run_test(LinalgNoDim1DModel(-4), x)
self.run_test(LinalgNoDim1DModel(1.5), x)
class LinalgNoDim2DModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgNoDim2DModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.norm(x, ord=self.ord)
x = torch.randn(2, 3)
self.run_test(LinalgNoDim2DModel('fro'), x)
self.run_test(LinalgNoDim2DModel(float('inf')), x)
self.run_test(LinalgNoDim2DModel(-float('inf')), x)
self.run_test(LinalgNoDim2DModel(1), x)
self.run_test(LinalgNoDim2DModel(-1), x)
@skipIfUnsupportedMinOpsetVersion(11)
def test_linalg_vector_norm_zero(self):
class LinalgVectorNormModel(torch.nn.Module):
def __init__(self, ord_val):
super(LinalgVectorNormModel, self).__init__()
self.ord = ord_val
def forward(self, x):
return torch.linalg.vector_norm(x, ord=self.ord)
x = torch.randn(2, 3, 5, 5)
self.run_test(LinalgVectorNormModel(0), x)
def test_linalg_vector_norm(self):
class LinalgVectorNormModel(torch.nn.Module):
def __init__(self, ord_val, dim_info):
super(LinalgVectorNormModel, self).__init__()
self.ord = ord_val
self.dim, self.keepdim = dim_info
def forward(self, x):
return torch.linalg.vector_norm(x, ord=self.ord, dim=self.dim, keepdim=self.keepdim)
x = torch.randn(2, 3, 5, 5)
ord_options = [2, float('inf'), -float('inf'), -4, 1.5]
dim_options = [(None, False), (1, False), ((1, 2), False), ((1, 2), True)]
for ord_val in ord_options:
for dim_info in dim_options:
self.run_test(LinalgVectorNormModel(ord_val, dim_info), x)
def test_linalg_matrix_norm(self):
class LinalgMatrixNormModel(torch.nn.Module):
def __init__(self, ord_val, dim_val=(-2, -1), keepdim_val=False):
super(LinalgMatrixNormModel, self).__init__()
self.ord = ord_val
self.dim = dim_val
self.keepdim = keepdim_val
def forward(self, x):
return torch.linalg.matrix_norm(x, ord=self.ord, dim=self.dim, keepdim=self.keepdim)
x = torch.randn(2, 3, 5, 5)
ord_options = ['fro', float('inf'), -float('inf'), 1, -1]
for ord_val in ord_options:
self.run_test(LinalgMatrixNormModel(ord_val), x)
self.run_test(LinalgMatrixNormModel(ord_val, (0, 2)), x)
self.run_test(LinalgMatrixNormModel(ord_val, (0, 2), True), x)
# This test checks that the output scalar type in the ONNX graph is not null
# https://github.com/pytorch/pytorch/issues/28607
@skipIfUnsupportedMinOpsetVersion(10)
@ -10256,6 +10403,18 @@ class TestONNXRuntime(unittest.TestCase):
loaded_model = onnx.load_from_string(f.getvalue())
self.assertEqual(loaded_model.graph.output[0].type.tensor_type.shape.dim[1].dim_value, 128)
@skipIfUnsupportedMinOpsetVersion(10)
def test_quantized_linear(self):
model = torch.nn.quantized.Linear(1, 2)
input = torch.rand(1, 1)
input_tensor = torch.quantize_per_tensor(input, 1, 0, torch.quint8)
# Currently, we need to convert the model to a ScriptModule before export.
# The reason is that PackedParams contains int (not tensor).
# Then it fails when the exporter calls _trace_and_get_graph_from_model().
# TODO: https://msdata.visualstudio.com/Vienna/_workitems/edit/1547858
self.run_test(torch.jit.trace(model, input_tensor), (input_tensor,))
self.run_test(torch.jit.script(model), (input_tensor,))
def make_test(name, base, layer, bidirectional, initial_state,
variable_length, dropout, script_test_min_opset_version,
**extra_kwargs):

View File

@ -114,5 +114,42 @@ class TestONNXShapeInference(unittest.TestCase):
slice = g.op("Slice", input, start_input, end, axis, step)
self.run_test(g, slice.node(), expect_tensor(None, shape=(None, None)))
def test_broadcast_matmul(self):
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2))
constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 5, 1, 1)))
# test when first input is of rank 1
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(2))
constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 1, 1)))
# test when second input is of rank 1
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2))
constant_2 = self.insert_tensor_constant(g, torch.ones(2))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=(5, 1)))
# test when both inputs are of rank 1
g = self.create_empty_graph()
constant = self.insert_tensor_constant(g, torch.ones(2))
constant_2 = self.insert_tensor_constant(g, torch.ones(2))
shape = g.op("MatMul", constant, constant_2)
self.run_test(g, shape.node(), expect_tensor("Float", shape=()))
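As a quick cross-check (a hand-worked sketch, not part of the test file), the expected shapes above mirror eager-mode torch.matmul broadcasting:
import torch
print(torch.matmul(torch.ones(5, 1, 2), torch.ones(3, 1, 2, 1)).shape)  # torch.Size([3, 5, 1, 1])
print(torch.matmul(torch.ones(2), torch.ones(3, 1, 2, 1)).shape)        # torch.Size([3, 1, 1])
print(torch.matmul(torch.ones(5, 1, 2), torch.ones(2)).shape)           # torch.Size([5, 1])
print(torch.matmul(torch.ones(2), torch.ones(2)).shape)                 # torch.Size([])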
def test_expand(self):
g = self.create_empty_graph()
input = g.addInput()
constant = self.insert_tensor_constant(g, torch.ones(2, 4))
input.setType(constant.type().with_sizes([None, None]))
shape = g.op("Shape", input)
expand = g.op("Expand", constant, shape)
self.run_test(g, expand.node(), expect_tensor("Float", shape=(None, None)))
if __name__ == '__main__':
unittest.main()

View File

@ -853,9 +853,10 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
qconfig = torch.quantization.default_qconfig
self._test_auto_tracing(model_fp32, qconfig, (torch.randn(1, 1, 2, 2),))
@unittest.skip('this depends on unsupported syntax detection, currently disabled')
def test_vovnet_sequential(self):
# We cannot quantize SequentialAppendList directly because
# AutoQuantizationStateModuleDict would appear in self.items.
# However, we can wrap it and quantize the wrapper.
class SequentialAppendList(nn.Sequential):
def __init__(self, *args):
super(SequentialAppendList, self).__init__(*args)
@ -870,7 +871,16 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
x = torch.cat(concat_list, dim=1)
return x
m = SequentialAppendList(torch.nn.Conv2d(1, 1, 1)).eval()
class Wrapper(nn.Module):
def __init__(self, *args):
super().__init__()
self.append_list = SequentialAppendList(*args)
def forward(self, x):
x = self.append_list(x)
return x
m = Wrapper(torch.nn.Conv2d(1, 1, 1)).eval()
qconfig = torch.quantization.default_qconfig
self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 1, 1),))
@ -922,10 +932,11 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
model_fp32, qconfig, (torch.randn(1, 1, 2, 2),),
fuse_modules=False)
# this is broken because AutoQuantizationState appears in self.items
@unittest.skip('TODO fix this')
def test_module_calls_items(self):
class M(torch.nn.ModuleDict):
# We cannot quantize M1 directly because
# AutoQuantizationStateModuleDict would appear in self.items.
# However, we can wrap it and quantize the wrapper.
class M1(torch.nn.ModuleDict):
def __init__(self):
super().__init__()
for i in range(2):
@ -938,10 +949,22 @@ class TestQuantizeDBR(QuantizeDBRTestCase):
layers.append(layer(x))
return torch.cat(layers, dim=1)
model_fp32 = M().eval()
class M2(torch.nn.Module):
def __init__(self):
super().__init__()
self.m1 = M1()
def forward(self, x):
x = self.m1(x)
return x
model_fp32 = M2().eval()
qconfig = torch.quantization.default_qconfig
self._test_auto_tracing(
model_fp32, qconfig, (torch.randn(1, 1, 2, 2),))
model_fp32, qconfig, (torch.randn(1, 1, 2, 2),),
# TODO(future PR): implement observer sharing for torch.cat
# in DBR quant, to ensure that numerical behavior matches
do_fx_comparison=False)
def test_subclass_of_quantizeable_module(self):
"""

View File

@ -3,7 +3,6 @@
import torch
import torch.nn as nn
import torch.nn.quantized as nnq
import torch.nn.quantized._reference as nnqr
from torch.nn.utils.rnn import PackedSequence
from torch.ao.quantization import (
quantize,
@ -75,130 +74,6 @@ import unittest
import numpy as np
class TestQuantizeEagerOps(QuantizationTestCase):
def _test_reference_module_impl(self,
float_module_class,
quantized_module_class,
extra_module_kwargs,
input_size):
class M(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv = float_module_class(**extra_module_kwargs)
self.quant = QuantStub()
self.dequant = DeQuantStub()
def forward(self, x):
x = self.quant(x)
x = self.conv(x)
x = self.dequant(x)
return x
class RefM(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv = float_module_class(**extra_module_kwargs)
self.quant1 = QuantStub()
self.dequant1 = DeQuantStub()
self.quant2 = QuantStub()
self.dequant2 = DeQuantStub()
def forward(self, x):
x = self.quant1(x)
x = self.dequant1(x)
x = self.conv(x)
x = self.quant2(x)
x = self.dequant2(x)
return x
qengine = 'fbgemm'
with override_quantized_engine(qengine):
data = torch.randn(*input_size, dtype=torch.float)
original_m = M()
original_ref_m = RefM()
original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach())
original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach())
original_m.qconfig = torch.quantization.default_qconfig
m = prepare(original_m)
# calibration
m(data)
m = convert(m)
# check if the module is properly quantized
self.assertEqual(type(m.quant), nnq.Quantize)
self.assertEqual(type(m.conv), quantized_module_class)
self.assertEqual(type(m.dequant), nnq.DeQuantize)
res = m(data)
# quantize the reference model
original_ref_m.eval()
original_ref_m.qconfig = torch.quantization.default_qconfig
ref_m = prepare(original_ref_m)
ref_m(data)
reference_module_mapping = {
QuantStub: nnq.Quantize,
DeQuantStub: nnq.DeQuantize,
nn.Conv1d: nnqr.Conv1d,
nn.Conv2d: nnqr.Conv2d,
nn.Conv3d: nnqr.Conv3d,
nn.ConvTranspose1d: nnqr.ConvTranspose1d,
nn.ConvTranspose2d: nnqr.ConvTranspose2d,
nn.ConvTranspose3d: nnqr.ConvTranspose3d,
}
ref_m = convert(ref_m, mapping=reference_module_mapping)
ref_res = ref_m(data)
self.assertEqual(res, ref_res)
def test_conv_1d(self):
self._test_reference_module_impl(
nn.Conv1d,
nnq.Conv1d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 1)
)
def test_conv_2d(self):
self._test_reference_module_impl(
nn.Conv2d,
nnq.Conv2d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10)
)
def test_conv_3d(self):
self._test_reference_module_impl(
nn.Conv3d,
nnq.Conv3d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10, 10)
)
def test_conv_transpose_1d(self):
self._test_reference_module_impl(
nn.ConvTranspose1d,
nnq.ConvTranspose1d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 1)
)
def test_conv_transpose_2d(self):
self._test_reference_module_impl(
nn.ConvTranspose2d,
nnq.ConvTranspose2d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10)
)
def test_conv_transpose_3d(self):
self._test_reference_module_impl(
nn.ConvTranspose3d,
nnq.ConvTranspose3d,
{'in_channels': 1, 'out_channels': 1, 'kernel_size': 1},
(16, 1, 10, 10, 10)
)
def _test_activation_op_impl(
self, float_module_class, quantized_module_class, extra_module_kwargs):
""" Implementation for testing common activation ops like leaky relu

View File

@ -1,5 +1,6 @@
# Owner(s): ["oncall: quantization"]
import copy
import math
import torch
import torch.nn as nn
@ -10,6 +11,7 @@ from torch.nn.modules.utils import _pair
import torch.nn.quantized as nnq
import torch.nn.quantized.dynamic as nnqd
import torch.nn.qat as nnqat
import torch.nn.intrinsic.qat as nniqat
import torch.nn.qat.dynamic as nnqatd
from torch.ao.quantization import (
prepare,
@ -984,6 +986,43 @@ class TestQuantizeEagerQATNumerics(QuantizationTestCase):
qat_op_optim.step()
qat_ref_op_optim.step()
@override_qengines
def test_linear_bn_numerics(self):
qengine = torch.backends.quantized.engine
m_ref = nn.Sequential(
nn.Linear(4, 4),
nn.BatchNorm1d(4),
)
m_ref_copy = copy.deepcopy(m_ref)
m_ref_copy = torch.ao.quantization.fuse_modules_qat(m_ref_copy, [['0', '1']])
qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine)
m_ref_copy[0].qconfig = qconfig
m = nniqat.LinearBn1d.from_float(m_ref_copy[0])
# without fake_quants, fused QAT module should match fp32 module
m.apply(torch.quantization.disable_fake_quant)
data = torch.randn(4, 4)
r1 = m_ref(data)
r2 = m(data)
self.assertTrue(torch.allclose(r1, r2))
@override_qengines
def test_linear_bn_workflow(self):
qengine = torch.backends.quantized.engine
m = nn.Sequential(
QuantStub(),
nn.Linear(4, 4),
nn.BatchNorm1d(4),
)
data = torch.randn(4, 4)
m.qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine)
m = torch.ao.quantization.fuse_modules_qat(m, [['1', '2']])
mp = prepare_qat(m)
mp(data)
mq = convert(mp)
self.assertTrue(type(mq[1]) == nnq.Linear)
self.assertTrue(type(mq[2]) == nn.Identity)
if __name__ == '__main__':
raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
"\tpython test/test_quantization.py TESTNAME\n\n"

View File

@ -17533,51 +17533,12 @@ class TestNNDeviceType(NNTestCase):
)
self.assertEqual(output_non_contig, output_contig)
@onlyCUDA
@dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long)))
def test_embedding_bag_bfloat16(self, device, dtypes):
self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True)
self._test_EmbeddingBag(device, 'mean', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True)
@dtypesIfCUDA(torch.float)
@dtypes(torch.float)
def test_transform_bias_rescale_qkv(self, device, dtype):
# TODO: debug CPU test failure with settings (48, 4, 16, 8) and add that mode
tests = [
(64, 4, 16, 8),
# dim_per_head = 12 does not divide evenly by CPU vectorization length of 8
(24, 2, 4, 2),
# Make sure CUDA can handle small input sizes
(2, 2, 2, 2),
# dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, causes alignment issues
(24, 4, 4, 2)
]
for (embed_dim, num_heads, sl, bs) in tests:
x = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype) * 10
qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype)
with torch.no_grad():
(q, k, v) = torch._transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias, num_head=num_heads)
def simple_transform_bias_rescale_qkv(qkv, bias):
(q, k, v) = torch.split(qkv, embed_dim, dim=-1)
(q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1)
return tuple(
x.reshape((sl, bs, num_heads, embed_dim // num_heads)).transpose(2, 1)
for x in (
(q + q_bias) / math.sqrt(embed_dim // num_heads),
(k + k_bias),
(v + v_bias)
)
)
correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias)
self.assertEqual(q.size(), correct_q.size())
self.assertTrue(torch.allclose(q, correct_q))
self.assertTrue(torch.allclose(k, correct_k))
self.assertTrue(torch.allclose(v, correct_v))
@onlyCUDA
@dtypes(torch.half, torch.float, torch.double)
def test_multihead_attention_dtype(self, device, dtype):

View File

@ -5773,7 +5773,7 @@ class TestTorch(TestCase):
for reduce in reduces:
for dim in range(len(shape)):
output = input._scatter_reduce(dim, index, reduce, output_size=output_size)
output = input.scatter_reduce(dim, index, reduce, output_size=output_size)
# Check that output is of the correct size
output_shape = copy.copy(shape)
@ -5807,16 +5807,16 @@ class TestTorch(TestCase):
self.assertTrue(torch.allclose(output, expected))
with self.assertRaisesRegex(RuntimeError, "Expected `dim` to be in range -3 to 2"):
torch._scatter_reduce(input, 4, index, "sum")
torch.scatter_reduce(input, 4, index, "sum")
with self.assertRaisesRegex(RuntimeError, "Shape mismatch"):
index2 = torch.randint(0, output_size, (10, ), dtype=torch.long, device=device)
torch._scatter_reduce(input, 0, index2, "sum")
torch.scatter_reduce(input, 0, index2, "sum")
with self.assertRaisesRegex(RuntimeError, "Expected `index` values to be in range 0 to 2"):
input2 = torch.randn(10, dtype=dtype, device=device)
index2 = torch.tensor([0, 1, 0, 1, 2, 3, 3, 4, 4, 3])
torch._scatter_reduce(input2, 0, index2, "sum", output_size=2)
torch.scatter_reduce(input2, 0, index2, "sum", output_size=2)
def test_structseq_repr(self):
a = torch.arange(250).reshape(5, 5, 10)

2
third_party/fbgemm vendored

@ -1 +1 @@
Subproject commit 365abe3ee878b2592e9a33f937d96df0048d99dd
Subproject commit ab3ca6647d3f4be25423c5f997256a8a219fb762

View File

@ -2595,6 +2595,6 @@
- name: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
output_differentiability: [False]
- name: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
- name: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
self: scatter_reduce_backward(grad, self, dim, index, reduce, result)
index: non_differentiable

View File

@ -1176,7 +1176,6 @@ aten_native_source_non_codegen_list = [
"aten/src/ATen/native/quantized/library.cpp",
"aten/src/ATen/quantized/QTensorImpl.cpp",
"aten/src/ATen/quantized/Quantizer.cpp",
"aten/src/ATen/native/attention.cpp",
"aten/src/ATen/native/Activation.cpp",
"aten/src/ATen/native/AdaptiveAveragePooling.cpp",
"aten/src/ATen/native/AdaptiveAveragePooling3d.cpp",

View File

@ -1 +1 @@
21ca53c291a88b53dac85751b7a0203ca610ac94b7adaff3c092cf30df4168f2
e1c8b97b919541a99e0a355df5c3f9e8abebc64259dbee6f8c68e1ef90582856

View File

@ -1 +1 @@
5fde7bccf65032da297dfb1f18e4a95e96e278fa397e9dcaf364dfe23ec46353
1485a242a96c737ba7cdd9f259114f2201accdb46d87ac7a8650b1a814cd4d4d

View File

@ -193,50 +193,45 @@ In multiline mode, each line next includes the name of a CircleCI job,
followed by the time of the specified test in that job at that commit.
Example:
$ tools/stats/test_history.py --mode=multiline --ref=594a66 --sha-length=8 --test=test_set_dir \
--job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test
2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc5_4_test 0.36s
2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc7_test 0.573s errored
2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc5_4_test 0.819s
2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc7_test 0.449s
2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc5_4_test 0.361s
2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc7_test 0.454s
2021-02-10 10:09:10Z 2e35fe95 (no reports in S3)
2021-02-10 10:09:07Z ff73be7e (no reports in S3)
2021-02-10 10:05:39Z 74082f0d (no reports in S3)
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.414s
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.476s
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.377s
2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.326s
$ tools/stats/test_history.py --mode=multiline --ref=86a961af879 --sha-length=8 \
--test=test_composite_compliance_dot_cpu_float32 \
--job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1
2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc5.4-test-default1 0.001s
2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc7-test-default1 0.001s
2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc7-test-default1 0.001s
2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc7-test-default1 0.001s
2022-02-18 13:14:56Z e73eaffd (no reports in S3)
2022-02-18 06:29:12Z 710f12f5 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
Another multiline example, this time with the --all flag:
$ tools/stats/test_history.py --mode=multiline --all --ref=321b9 --delta=12 --sha-length=8 \
--test=test_qr_square_many_batched_complex_cuda
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 424.284s
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 402.572s
2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.164s
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 436.732s
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 407.616s
2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.044s
$ tools/stats/test_history.py --mode=multiline --all --ref=86a961af879 --delta=12 --sha-length=8 \
--test=test_composite_compliance_dot_cuda_float32
2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-slow1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 linux-xenial-cuda11.3-py3.7-gcc7-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 periodic-linux-bionic-cuda11.5-py3.7-gcc7-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test-default1 0.001s skipped
2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test-default1 0.001s skipped
In columns mode, the name of the job isn't printed, but the order of the
columns is guaranteed to match the order of the jobs passed on the
command line. Example:
$ tools/stats/test_history.py --mode=columns --ref=3cf783 --sha-length=8 --test=test_set_dir \
--job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test
2021-02-10 12:18:50Z 3cf78395 0.644s 0.312s
2021-02-10 11:13:34Z 594a66d7 0.360s errored
2021-02-10 10:13:25Z 9c0caf03 0.819s 0.449s
2021-02-10 10:09:14Z 602434bc 0.361s 0.454s
2021-02-10 10:09:10Z 2e35fe95
2021-02-10 10:09:07Z ff73be7e
2021-02-10 10:05:39Z 74082f0d
2021-02-10 07:42:29Z 0620c96f 0.414s 0.377s (2 job re-runs omitted)
2021-02-10 07:27:53Z 33afb5f1 0.381s 0.294s
$ tools/stats/test_history.py --mode=columns --ref=86a961af879 --sha-length=8 \
--test=test_composite_compliance_dot_cpu_float32 \
--job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1
2022-02-18 15:47:37Z 86a961af 0.001s 0.001s
2022-02-18 15:12:34Z f5e201e4 0.001s 0.001s
2022-02-18 13:14:56Z 1c0df265 0.001s 0.001s
2022-02-18 13:14:56Z e73eaffd
2022-02-18 06:29:12Z 710f12f5 0.001s 0.001s
2022-02-18 05:20:30Z 51b04f27 0.001s 0.001s
2022-02-18 03:49:46Z 69389fb5 0.001s 0.001s
2022-02-18 00:19:12Z 056b6260 0.001s 0.001s
2022-02-17 23:58:32Z 39fb7714 0.001s 0.001s
Minor note: in columns mode, a blank cell means that no report was found
in S3, while the word "absent" means that a report was found but the

View File

@ -53,6 +53,7 @@ def parse_description(description: str) -> List[Example]:
return examples
@unittest.skip("Skipping as this test is fragile, issue #73083")
class TestTestHistory(unittest.TestCase):
maxDiff = None

View File

@ -326,7 +326,8 @@ def _jit_pass_onnx_remove_print(graph: Graph) -> None: ...
def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ...
def _jit_pass_onnx_unpack_quantized_weights(
graph: Graph,
paramsDict: Dict[str, IValue]
paramsDict: Dict[str, IValue],
caffe2: _bool
) -> Dict[str, IValue]: ...
def _jit_pass_onnx_quantization_insert_permutes(
graph: Graph,
@ -409,7 +410,7 @@ def _import_ir_module_from_package(
) -> ScriptModule: ...
def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ...
def _check_onnx_proto(proto: str) -> None: ...
def _check_onnx_proto(proto: str, full_check: _bool = False) -> None: ...
def _propagate_and_assign_input_shapes(
graph: Graph,
inputs: Tuple[Tensor, ...],

View File

@ -3374,6 +3374,12 @@ Example::
""".format(**reproducibility_notes))
add_docstr_all('scatter_reduce', r"""
scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor
See :func:`torch.scatter_reduce`
""")
add_docstr_all('select',
r"""
select(dim, index) -> Tensor

View File

@ -8547,6 +8547,59 @@ scatter_add(input, dim, index, src) -> Tensor
Out-of-place version of :meth:`torch.Tensor.scatter_add_`
""")
add_docstr(torch.scatter_reduce, r"""
scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor
Reduces all values from the :attr:`input` tensor to the indices specified in
the :attr:`index` tensor. For each value in :attr:`input`, its output index is
specified by its index in :attr:`input` for ``dimension != dim`` and by the
corresponding value in :attr:`index` for ``dimension = dim``.
The applied reduction for non-unique indices is defined via the :attr:`reduce`
argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`).
For non-existing indices, the output will be filled with the identity of the
applied reduction (1 for :obj:`"prod"` and 0 otherwise).
It is also required that ``index.size(d) == input.size(d)`` for all dimensions ``d``.
Moreover, if :attr:`output_size` is defined, the values of :attr:`index` must be
between ``0`` and ``output_size - 1`` inclusive.
For a 3-D tensor with :obj:`reduce="sum"`, the output is given as::
out[index[i][j][k]][j][k] += input[i][j][k] # if dim == 0
out[i][index[i][j][k]][k] += input[i][j][k] # if dim == 1
out[i][j][index[i][j][k]] += input[i][j][k] # if dim == 2
Note:
This out-of-place operation is similar to the in-place versions of
:meth:`~torch.Tensor.scatter_` and :meth:`~torch.Tensor.scatter_add_`,
in which the output tensor is automatically created according to the
maximum values in :attr:`index` and filled based on the identity of the
applied reduction.
Note:
{forward_reproducibility_note}
Args:
input (Tensor): the input tensor
dim (int): the axis along which to index
index (LongTensor): the indices of elements to scatter and reduce.
reduce (str): the reduction operation to apply for non-unique indices
(:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`)
output_size (int, optional): the size of the output at dimension :attr:`dim`.
If set to :obj:`None`, will get automatically inferred according to
:obj:`index.max() + 1`
Example::
>>> input = torch.tensor([1, 2, 3, 4, 5, 6])
>>> index = torch.tensor([0, 1, 0, 1, 2, 1])
>>> torch.scatter_reduce(input, 0, index, reduce="sum", output_size=3)
tensor([4, 12, 5])
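When :attr:`output_size` is :obj:`None`, it is inferred as ``index.max() + 1``;
continuing the example above (result worked out by hand, shown only as an illustration)::
    >>> torch.scatter_reduce(input, 0, index, reduce="amax")
    tensor([3, 6, 5])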
""".format(**reproducibility_notes))
add_docstr(torch.select,
r"""
select(input, dim, index) -> Tensor

View File

@ -80,6 +80,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]:
nnqatd.Linear,
nnqd.Linear,
nniqat.LinearReLU,
nniqat.LinearBn1d,
nn.modules.linear.NonDynamicallyQuantizableLinear,
]),
# linear functionals
@ -572,6 +573,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
nniqat.ConvReLU2d,
nniqat.ConvReLU3d,
nniqat.LinearReLU,
nniqat.LinearBn1d,
nniqd.LinearReLU,
])

View File

@ -14,6 +14,8 @@ from .utils import (
get_torch_function_hook_type,
get_module_hook_type,
OpQuantizeabilityType,
AutoQuantizationStateModuleDict,
get_fqn_valid_for_module_dict_key,
)
from .model_utils import (
pack_weights_for_functionals,
@ -350,6 +352,8 @@ def add_auto_observation(
for _, child_child in child.named_modules():
leaves.add(child_child)
self._fqn_to_auto_quant_state_map = AutoQuantizationStateModuleDict()
for fqn, v in named_modules:
# fqn is the global FQN, i.e. 'foo.bar.baz'
@ -366,14 +370,39 @@ def add_auto_observation(
if v is self:
# for the top level module only, specify input
# and output dtypes
v._auto_quant_state = AutoQuantizationState(
auto_quant_state = AutoQuantizationState(
qconfig_dict, fqn,
input_dtypes, output_dtypes)
pass
else:
v._auto_quant_state = AutoQuantizationState(
auto_quant_state = AutoQuantizationState(
qconfig_dict, fqn)
# The code below registers the auto_quant_state object
# of the child in the module hierarchy of the parent,
# and adds the auto_quant_state object to the child
# with a raw __setattr__, without registering it in
# the module hierarchy of the child.
# This is solving the problem of both storing extra state
# (observers) as well as not modifying the meaning of user
# code in child modules which iterates over all module
# children.
#
# This narrows down the issue of dynamically adding
# children to only affect the top level module and not
# the children.
# On the parent, register this module in the FQN map
fqn_to_use_for_key = \
get_fqn_valid_for_module_dict_key(fqn)
self._fqn_to_auto_quant_state_map[fqn_to_use_for_key] = \
auto_quant_state
# On the child, manually set the attribute without
# going through the `torch.nn.Module.__setattr__`
# function, to prevent this object from appearing in
# the child's module hierarchy.
object.__setattr__(
v, '_auto_quant_state', auto_quant_state)
global_op_idx[0] = 0
output = super().__call__(*new_args, **new_kwargs)
@ -688,6 +717,6 @@ def add_auto_convert(module : torch.nn.Module) -> torch.nn.Module:
# checking the fix into `torch.nn.Sequential` to avoid the patch.
def _nn_sequential_patched_forward(cls, input):
for module in cls:
if not isinstance(module, AutoQuantizationState):
if not isinstance(module, AutoQuantizationStateModuleDict):
input = module(input)
return input
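A standalone sketch (not part of this diff) of the attribute trick described in the registration comment above: a normal assignment registers a module as a child, while a raw object.__setattr__ keeps it out of the child's module hierarchy.
import torch

class State(torch.nn.Module):
    pass

host = torch.nn.Linear(2, 2)
state = State()

host.extra = state                          # registered: visible to code iterating over children
print(list(dict(host.named_children())))    # ['extra']

del host.extra
object.__setattr__(host, 'extra', state)    # raw set: hidden from the module hierarchy
print(list(dict(host.named_children())))    # []
print(host.extra is state)                  # True, still reachable as a plain attribute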

View File

@ -8,7 +8,10 @@ import torch
import torch.fx
from .mappings import conv_ops
from .quantization_state import AutoQuantizationState
from .utils import get_packable_arg_idxs
from .utils import (
get_packable_arg_idxs,
AutoQuantizationStateModuleDict,
)
class AllModuleTracer(torch.fx.Tracer):
"""
@ -207,7 +210,7 @@ class AllModuleTracer(torch.fx.Tracer):
# class.
# TODO(future): remove the hack
def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tuple[Any, ...], kwargs : Dict[str, Any]) -> Any:
if isinstance(m, AutoQuantizationState):
if isinstance(m, AutoQuantizationStateModuleDict):
return args[0]
return super().call_module(m, forward, args, kwargs)

View File

@ -583,10 +583,9 @@ def get_torch_function_hook_type(
# the direct __dict__ accesses are for performance, because
# the default `torch.nn.Module.__getattr__` has overhead.
parent_module_has_qstate = parent_module is not None and \
'_modules' in parent_module.__dict__ and \
'_auto_quant_state' in parent_module.__dict__['_modules']
'_auto_quant_state' in parent_module.__dict__
needs_op_hooks = parent_module_has_qstate and \
parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator]
parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator]
if needs_op_hooks:
return HookType.OP_HOOKS
@ -608,17 +607,15 @@ def get_module_hook_type(
if cached_hook_type is not None:
return cached_hook_type
parent_module_has_qstate = parent_module is not None and \
'_modules' in parent_module.__dict__ and \
'_auto_quant_state' in parent_module.__dict__['_modules']
'_auto_quant_state' in parent_module.__dict__
needs_op_hooks = parent_module_has_qstate and \
parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator]
parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator]
# We need IO hooks if
# * we are calling forward on a module (always True here)
# * that module has quant state
# * that module does not need op hooks for the parent
needs_io_hooks = (
'_modules' in cur_module.__dict__ and
'_auto_quant_state' in cur_module.__dict__['_modules'] and
'_auto_quant_state' in cur_module.__dict__ and
(not needs_op_hooks)
)
needs_arg_dequants = parent_module_has_qstate and not needs_op_hooks
@ -727,3 +724,18 @@ def get_cur_qconfig(
qconfig_dict, cur_op_type, cur_fqn, global_qconfig)
return qconfig
# We store quantization state for all children on the top level module in a
# ModuleDict. In order to properly special case this module from other
# ModuleDict instances, we create a marker class for it.
class AutoQuantizationStateModuleDict(torch.nn.ModuleDict):
pass
def get_fqn_valid_for_module_dict_key(fqn: str) -> str:
"""
Modifies `fqn` to make it a valid key to a ModuleDict.
"""
if fqn == '':
fqn = ' '
return fqn.replace('.', ':')
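A small illustration of why the rewrite above is needed (assuming only stock torch.nn.ModuleDict behavior): keys containing dots are rejected, while ':'-separated keys are accepted.
import torch

d = torch.nn.ModuleDict()
d['features:0:conv'] = torch.nn.Conv2d(1, 1, 1)      # ':'-separated key is accepted
try:
    d['features.0.conv'] = torch.nn.Conv2d(1, 1, 1)  # dotted FQN is rejected
except KeyError as exc:
    print(exc)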

View File

@ -82,6 +82,8 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None,
for v in parents_to_delete_auto_quant_state:
del v._auto_quant_state
del model._fqn_to_auto_quant_state_map
# the model hierarchy might have changed during fusion, so we
# have to delete the cached module hook types
for k, v in model.named_modules():

View File

@ -114,7 +114,12 @@ def fuse_linear_bn(is_qat, linear, bn):
if is_qat:
# TODO: remove the assert later
assert linear.training, "qat is only supported when linear.training is True currently"
raise Exception("Fusing Linear+BatchNorm not yet supported in training.")
assert bn.num_features == linear.out_features,\
"Output features of Linear must match num_features of BatchNorm1d"
assert bn.affine, "Only support fusing BatchNorm1d with affine set to True"
assert bn.track_running_stats,\
"Only support fusing BatchNorm1d with tracking_running_stats set to True"
return nni.LinearBn1d(linear, bn)
else:
return nn.utils.fusion.fuse_linear_bn_eval(linear, bn)
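For the eval-mode branch above, a minimal usage sketch (relying only on the fuse_linear_bn_eval helper referenced here): the fused Linear should reproduce Linear followed by BatchNorm1d.
import torch

linear = torch.nn.Linear(4, 4).eval()
bn = torch.nn.BatchNorm1d(4).eval()
fused = torch.nn.utils.fusion.fuse_linear_bn_eval(linear, bn)

x = torch.randn(2, 4)
print(torch.allclose(fused(x), bn(linear(x)), atol=1e-6))  # True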

View File

@ -77,6 +77,7 @@ DEFAULT_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
nniqat.ConvReLU2d: nniq.ConvReLU2d,
nniqat.ConvReLU3d: nniq.ConvReLU3d,
nniqat.LinearReLU: nniq.LinearReLU,
nniqat.LinearBn1d: nnq.Linear,
# QAT modules:
nnqat.Linear: nnq.Linear,
nnqat.Conv2d: nnq.Conv2d,
@ -99,6 +100,7 @@ DEFAULT_QAT_MODULE_MAPPINGS : Dict[Callable, Any] = {
nni.ConvReLU2d: nniqat.ConvReLU2d,
nni.ConvReLU3d: nniqat.ConvReLU3d,
nni.LinearReLU: nniqat.LinearReLU,
nni.LinearBn1d: nniqat.LinearBn1d,
}
# Default map for swapping dynamic modules

View File

@ -16,7 +16,7 @@ from torch.ao.quantization.quantization_mappings import (
_has_special_act_post_process,
_get_special_act_post_process,
)
from .utils import get_qparam_dict
from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper
from torch.ao.quantization.qconfig import (
add_module_to_qconfig_obs_ctr,
@ -565,15 +565,7 @@ def swap_module(mod, mapping, custom_module_class_mapping):
new_mod = custom_module_class_mapping[type(mod)].from_observed(mod)
swapped = True
elif type(mod) in mapping:
qmod = mapping[type(mod)]
if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE:
assert mod.qconfig is not None
weight_post_process = mod.qconfig.weight()
weight_post_process(mod.weight)
weight_qparams = get_qparam_dict(weight_post_process)
new_mod = qmod.from_float(mod, weight_qparams)
else:
new_mod = qmod.from_float(mod)
new_mod = mapping[type(mod)].from_float(mod)
swapped = True
if swapped:

View File

@ -589,11 +589,10 @@ PyObject *THPModule_supportedQEngines(PyObject *_unused, PyObject *noargs)
{
auto qengines = at::globalContext().supportedQEngines();
auto list = THPObjectPtr(PyList_New(qengines.size()));
if (!list) return nullptr;
for (const auto i : c10::irange(qengines.size())) {
PyObject *i64 = THPUtils_packInt64(static_cast<int>(qengines[i]));
if (!i64) {
throw python_error();
}
if (!i64) return nullptr;
PyList_SET_ITEM(list.get(), i, i64);
}
return list.release();
@ -607,22 +606,18 @@ PyObject *THPModule_isEnabledXNNPACK(PyObject *_unused, PyObject *noargs)
PyObject *THPModule_setDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs)
{
try {
at::globalContext().setDefaultMobileCPUAllocator();
} catch (c10::Error& e) {
THPUtils_setError(e.what());
}
HANDLE_TH_ERRORS
at::globalContext().setDefaultMobileCPUAllocator();
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
PyObject *THPModule_unsetDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs)
{
try {
at::globalContext().unsetDefaultMobileCPUAllocator();
} catch (c10::Error& e) {
THPUtils_setError(e.what());
}
HANDLE_TH_ERRORS
at::globalContext().unsetDefaultMobileCPUAllocator();
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
static PyObject * THPModule_vmapmode_increment_nesting(PyObject* _unused, PyObject *arg) {

View File

@ -155,6 +155,19 @@ struct OpEventData {
torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
};
struct MemoryEventData {
int64_t start_time;
void* ptr;
int64_t alloc_size;
int64_t total_allocated;
int64_t total_reserved;
uint64_t threadID;
torch::profiler::impl::kineto::DeviceAndResource kineto_info;
c10::DeviceType device_type;
c10::DeviceIndex device_index;
};
static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");
// Assumption: Total threads number will not exceed 2^16-1, and total ops will
// not exceed 2^48 -1.
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
@ -204,29 +217,16 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
int64_t total_reserved,
c10::Device device) override {
if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
std::lock_guard<std::mutex> guard(state_mutex_);
auto start_time = getTimeUs();
if (cpu_trace_) {
torch::profiler::impl::kineto::recordThreadInfo();
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
torch::profiler::impl::kineto::kineto_ids(),
start_time,
device,
ptr,
alloc_size,
total_allocated,
total_reserved);
}
kineto_events_.emplace_back();
auto& evt = kineto_events_.back();
evt.name(kMemoryEventName)
.startUs(start_time)
.deviceIndex(device.index())
.deviceType(device.type())
.nBytes(alloc_size)
.startThreadId(at::RecordFunction::currentThreadId());
memory_events_.push_back(
{getTimeUs(),
ptr,
alloc_size,
total_allocated,
total_reserved,
at::RecordFunction::currentThreadId(),
torch::profiler::impl::kineto::kineto_ids(),
device.type(),
device.index()});
}
}
@ -264,6 +264,28 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
void materializeOpEvents() {
std::lock_guard<std::mutex> guard(state_mutex_);
for (const auto& e : memory_events_) {
cpu_trace_.addMemoryUsageActivity(
kMemoryEventName,
e.kineto_info,
e.start_time,
c10::Device(e.device_type, e.device_index),
e.ptr,
e.alloc_size,
e.total_allocated,
e.total_reserved);
kineto_events_.emplace_back();
auto& evt = kineto_events_.back();
evt.name(kMemoryEventName)
.startUs(e.start_time)
.deviceIndex(e.device_index)
.deviceType(e.device_type)
.nBytes(e.alloc_size)
.startThreadId(e.threadID);
}
for (const auto& e : op_events_) {
if (e.end_us_ < e.start_us_) {
// We initialize end_us_ to the smallest int64_t, so this means that
@ -406,7 +428,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
py_event_indices_{
{ nullptr,
std::string("null") }};
for (size_t i = 0; i < py_events.size(); i++) {
for (const auto i : c10::irange(py_events.size())) {
py_event_indices_.insert({py_events[i].get(), std::to_string(i)});
}
@ -585,6 +607,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
uint64_t start_time_;
std::set<torch::profiler::impl::ActivityType> activities_;
std::deque<OpEventData> op_events_;
std::deque<MemoryEventData> memory_events_;
torch::profiler::impl::kineto::TraceWrapper cpu_trace_;
std::vector<KinetoEvent> kineto_events_;
// Optional, if event post-processing is enabled.

View File

@ -833,8 +833,7 @@ void gather(
if (cur_rank == root)
{
for (int r = 0; r < numranks; r++)
{
for (const auto r : c10::irange(numranks)) {
if (r != root) {
auto* recvbuff = reinterpret_cast<char*>(outputs[r].data_ptr());
NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream));
@ -874,8 +873,7 @@ void scatter(
NCCL_CHECK(ncclGroupStart());
if (cur_rank == root)
{
for (int r = 0; r < numranks; r++)
{
for (const auto r : c10::irange(numranks)) {
if (r != root) {
size_t send_count = inputs[r].numel();
auto send_type = to_nccl_data_type(inputs[r]);

View File

@ -10,6 +10,7 @@
#include <iostream>
#include <vector>
#include <c10/util/irange.h>
#include <fmt/format.h>
#define ERROR(msg_fmt, ...) \
@ -47,7 +48,7 @@ int main(int argc, const char** argv) {
auto program_headers = (Elf64_Phdr*)(data + header->e_phoff);
auto n_program_headers = header->e_phnum;
Elf64_Dyn* dynamic = nullptr;
for (size_t i = 0; i < n_program_headers; ++i) {
for (const auto i : c10::irange(n_program_headers)) {
const Elf64_Phdr* phdr = &program_headers[i];
if (phdr->p_type == PT_DYNAMIC) {
dynamic = reinterpret_cast<Elf64_Dyn*>(data + phdr->p_offset);

View File

@ -650,11 +650,13 @@ Example::
.def(
"get",
[](::c10d::Store& store, const std::string& key) -> py::bytes {
auto value = store.get(key);
auto value = [&]() {
py::gil_scoped_release guard;
return store.get(key);
}();
return py::bytes(
reinterpret_cast<char*>(value.data()), value.size());
},
py::call_guard<py::gil_scoped_release>(),
R"(
Retrieves the value associated with the given ``key`` in the store. If ``key`` is not
present in the store, the function will wait for ``timeout``, which is defined

View File

@ -147,7 +147,7 @@ c10::optional<at::Tensor> runTorchSlice_opset10(
return c10::nullopt;
}
auto axes_a = inputTensorValues[3].accessor<int64_t, 1>();
axes.reserve(inputTensorValues[3].sizes()[0]);
axes.resize(inputTensorValues[3].sizes()[0]);
// ONNX slice accepts negative axis, fix this for aten op
for (const auto i : c10::irange(inputTensorValues[3].sizes()[0])) {
axes[i] = axes_a[i] < 0 ? axes_a[i] + inputTensorValues[0].sizes().size()

View File

@ -61,5 +61,12 @@ Node* transformToONNXConcatNode(
bool need_new_input,
int opset_version);
class ScalarTypeHashFunction {
public:
size_t operator()(const c10::ScalarType& type) const {
return static_cast<size_t>(type);
}
};
} // namespace jit
} // namespace torch

View File

@ -761,6 +761,25 @@ static void fuseListConstructListUnpack(Block* b) {
}
}
// https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export
static void eraseTupleConstruct(Block* block) {
size_t index = 0;
// TupleConstruct is generated from the symbolics in quantized domain, and
// consumed by other quantized operators. Any remaining TupleConstruct should
// be at the output of the blocks.
for (auto* output : block->outputs()) {
auto output_node = output->node();
if (output_node->kind() == prim::TupleConstruct) {
block->eraseOutput(index);
size_t input_index = 0;
for (auto* input : output_node->inputs()) {
block->insertOutput(index + (input_index++), input);
}
}
index++;
}
}
void removeMaxPoolUnusedOutput(Block* b) {
for (auto it = b->nodes().begin(), end = b->nodes().end(); it != end; ++it) {
auto n = *it;
@ -1025,6 +1044,7 @@ void PeepholeOptimizeONNX(
fuseListConstructListUnpack(graph->block());
fuseLogSoftmaxNllLoss(graph->block());
eraseListConstruct(graph->block(), opset_version);
eraseTupleConstruct(graph->block());
EliminateDeadCode(
graph->block(),
true,

View File

@ -1,6 +1,7 @@
#include <c10/util/irange.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/onnx/helper.h>
#include <torch/csrc/jit/passes/onnx/scalar_type_analysis.h>
namespace torch {
@ -11,13 +12,6 @@ using namespace ::c10::onnx;
}
namespace {
class ScalarTypeHashFunction {
public:
size_t operator()(const c10::ScalarType& type) const {
return static_cast<size_t>(type);
}
};
const int ONNX_OPSET_14 = 14;
static const std::unordered_map<c10::ScalarType, int, ScalarTypeHashFunction>

View File

@ -702,54 +702,59 @@ void SetShapeValueFromListConstructNode(Node* lc_node) {
}
}
std::vector<::c10::ShapeSymbol> Broadcast(const std::vector<::c10::ShapeSymbol> &input_shape_value_0,
const std::vector<::c10::ShapeSymbol> &input_shape_value_1) {
size_t rank_0 = input_shape_value_0.size();
size_t rank_1 = input_shape_value_1.size();
size_t rank_max = std::max(rank_0, rank_1);
size_t rank_min = std::min(rank_0, rank_1);
std::vector<::c10::ShapeSymbol> final_shape;
final_shape.reserve(rank_max);
for (auto idx = 0; idx < rank_max; idx++) {
final_shape.emplace_back(::c10::ShapeSymbol::newSymbol());
}
for (auto idx = 0; idx < rank_min; idx++) {
const c10::ShapeSymbol& ss_shape_0 =
input_shape_value_0[rank_0 - 1 - idx];
const c10::ShapeSymbol& ss_shape_1 =
input_shape_value_1[rank_1 - 1 - idx];
bool is_static_0 = ss_shape_0.is_static();
bool is_static_1 = ss_shape_1.is_static();
if (is_static_0 && is_static_1) {
int64_t static_0_sz = ss_shape_0.static_size();
int64_t static_1_sz = ss_shape_1.static_size();
final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize(
std::max(static_0_sz, static_1_sz));
} else if (!is_static_0 && !is_static_1) {
if (ss_shape_0.value() == ss_shape_1.value()) {
final_shape[rank_max - 1 - idx] = ss_shape_0;
}
}
}
if (rank_0 < rank_1) {
for (size_t idx = rank_min; idx < rank_max; idx++) {
size_t shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_1[shape_idx];
}
} else {
for (size_t idx = rank_min; idx < rank_max; idx++) {
size_t shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_0[shape_idx];
}
}
return final_shape;
}
void ProcessBroadcastNode(Node* n) {
TORCH_INTERNAL_ASSERT(n->inputs().size() == 2);
if (ConstantValueMap::HasShape(n->input(0)->debugName()) &&
ConstantValueMap::HasShape(n->input(1)->debugName())) {
auto input_shape_0 = ConstantValueMap::GetShape(n->input(0)->debugName());
auto input_shape_value_0 = input_shape_0.value().sizes();
auto input_shape_value_0 = input_shape_0.value().sizes().value();
auto input_shape_1 = ConstantValueMap::GetShape(n->input(1)->debugName());
auto input_shape_value_1 = input_shape_1.value().sizes();
size_t rank_0 = input_shape_value_0.value().size();
size_t rank_1 = input_shape_value_1.value().size();
size_t rank_max = std::max(rank_0, rank_1);
size_t rank_min = std::min(rank_0, rank_1);
std::vector<::c10::ShapeSymbol> final_shape;
final_shape.reserve(rank_max);
for (auto idx = 0; idx < rank_max; idx++) {
final_shape.emplace_back(::c10::ShapeSymbol::newSymbol());
}
for (auto idx = 0; idx < rank_min; idx++) {
const c10::ShapeSymbol& ss_shape_0 =
input_shape_value_0.value()[rank_0 - 1 - idx];
const c10::ShapeSymbol& ss_shape_1 =
input_shape_value_1.value()[rank_1 - 1 - idx];
bool is_static_0 = ss_shape_0.is_static();
bool is_static_1 = ss_shape_1.is_static();
if (is_static_0 && is_static_1) {
int64_t static_0_sz = ss_shape_0.static_size();
int64_t static_1_sz = ss_shape_1.static_size();
final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize(
std::max(static_0_sz, static_1_sz));
} else if (!is_static_0 && !is_static_1) {
if (ss_shape_0.value() == ss_shape_1.value()) {
final_shape[rank_max - 1 - idx] = ss_shape_0;
}
}
}
if (rank_0 < rank_1) {
for (auto idx = rank_min; idx < rank_max; idx++) {
auto shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_1.value()[shape_idx];
}
} else {
for (auto idx = rank_min; idx < rank_max; idx++) {
auto shape_idx = rank_max - 1 - idx;
final_shape[shape_idx] = input_shape_value_0.value()[shape_idx];
}
}
auto input_shape_value_1 = input_shape_1.value().sizes().value();
auto final_shape = Broadcast(input_shape_value_0, input_shape_value_1);
UpdateShape(n->output(0), c10::SymbolicShape(final_shape));
}
}
@ -857,6 +862,8 @@ void ProcessMatMulNode(Node* n) {
auto input_shape_value_1 = input_shape_1.sizes().value();
size_t rank_0 = input_shape_value_0.size();
size_t rank_1 = input_shape_value_1.size();
// Handle inputs of rank 1 just like numpy.matmul:
// https://numpy.org/doc/stable/reference/generated/numpy.matmul.html
auto is_rank_0_1 = false;
if (rank_0 == 1) {
input_shape_value_0.insert(
@ -870,25 +877,20 @@ void ProcessMatMulNode(Node* n) {
rank_1 = 2;
is_rank_1_1 = true;
}
size_t rank = std::max(rank_0, rank_1);
std::vector<::c10::ShapeSymbol> final_shape;
final_shape.reserve(rank);
if (rank_0 >= rank_1) {
for (auto idx = 0; idx < rank_0 - 2; idx++) {
final_shape.emplace_back(input_shape_value_0[idx]);
}
} else {
for (auto idx = 0; idx < rank_1 - 2; idx++) {
final_shape.emplace_back(input_shape_value_1[idx]);
}
// Per https://pytorch.org/docs/stable/generated/torch.matmul.html
// the broadcasting logic only applies to the batch dimensions, and not the matrix dimensions
// so we remove the matrix dimensions which are the last 2 dimensions before broadcasting
auto final_shape = Broadcast(
std::vector<::c10::ShapeSymbol>(input_shape_value_0.begin(), input_shape_value_0.end() - 2),
std::vector<::c10::ShapeSymbol>(input_shape_value_1.begin(), input_shape_value_1.end() - 2)
);
// add the last 2 dimensions back, unless they did not exist in the first place and were inserted by this function
// Then apply [n,k]X[k,m]=[n,m], where n=input_shape_value_0[rank_0 - 2], m=input_shape_value_1[rank_1 - 1]
if (!is_rank_0_1) {
final_shape.emplace_back(input_shape_value_0[rank_0 - 2]);
}
final_shape.emplace_back(input_shape_value_0[rank_0 - 2]);
final_shape.emplace_back(input_shape_value_1[rank_1 - 1]);
if (is_rank_0_1) {
final_shape.erase(final_shape.begin());
}
if (is_rank_1_1) {
final_shape.pop_back();
if (!is_rank_1_1) {
final_shape.emplace_back(input_shape_value_1[rank_1 - 1]);
}
UpdateShape(n->output(0), c10::SymbolicShape(final_shape));
}
@ -1374,6 +1376,8 @@ void ComputeConstant(Node* n, int opset_version) {
if (input0_shape_size.has_value()) {
auto input0_shape_value = input0_shape_size.value();
if (ConstantValueMap::HasValue(n->input(1)->debugName())) {
// When value of `shape` is statically known,
// output shape can be computed.
auto shape_temp = ConstantValueMap::GetValueInto1DInt64Vector(
n->input(1)->debugName());
auto final_shape =
@ -1381,6 +1385,23 @@ void ComputeConstant(Node* n, int opset_version) {
if (final_shape.has_value()) {
UpdateShape(n->output(), final_shape.value());
}
} else if (
auto expand_shape =
ConstantValueMap::GetShapeInto1DInt64VectorWithOneUnknown(
n->input(1)->debugName())) {
// When shape of `shape` is statically known,
// output rank can be computed.
TORCH_INTERNAL_ASSERT(
expand_shape.value().size() == 1,
"`Shape` input to `Expand` should be a 1-D tensor. Instead got rank ",
expand_shape.value().size());
if (expand_shape.value()[0] > 0) {
std::vector<c10::ShapeSymbol> final_shape;
for (const auto i : c10::irange(expand_shape.value()[0])) {
final_shape.emplace_back(c10::ShapeSymbol::newSymbol());
}
UpdateShape(n->output(), c10::SymbolicShape(final_shape));
}
}
}
}

View File

@ -9,12 +9,9 @@
#include <torch/csrc/jit/passes/onnx/helper.h>
#include <torch/csrc/jit/passes/subgraph_rewrite.h>
#ifndef AT_PER_OPERATOR_HEADERS
// TODO: Switch to per operator headers after
// https://github.com/pytorch/pytorch/pull/68693 is merged
#include <ATen/Functions.h>
#else
#include <ATen/ops/quantize_per_tensor.h>
#include <ATen/ops/zeros.h>
#endif
#include <stack>
@ -104,7 +101,7 @@ double getScaleFromInput(Node* input_node) {
input_name);
}
Node* CreateQuantizedWeights(
Node* CreateQuantizedWeightsCaffe2(
std::string data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes,
@ -118,7 +115,7 @@ Node* CreateQuantizedWeights(
return const_node;
}
Node* CreateQuantizedBias(
Node* CreateQuantizedBiasCaffe2(
std::vector<int64_t> data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes,
@ -132,6 +129,62 @@ Node* CreateQuantizedBias(
return const_node;
}
std::vector<Node*> CreateQuantizedWeights(
std::vector<float> data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes,
float scale,
int64_t zero_point) {
Node* const_node_1 = graph->create(prim::Constant);
auto const_value =
at::from_blob(data.data(), c10::IntArrayRef(shapes), at::kFloat)
.to(at::kCPU);
auto options = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU);
at::Tensor const_value_copy = at::empty(c10::IntArrayRef(shapes), options);
const_value_copy.copy_(const_value);
const_node_1->t_(Symbol::attr("value"), const_value_copy);
Node* const_node_2 = graph->create(prim::Constant);
std::vector<float> scale_v{scale};
std::vector<int64_t> scale_shapes{1};
auto const_shape =
at::from_blob(scale_v.data(), c10::IntArrayRef(scale_shapes), at::kFloat)
.to(at::kCPU);
at::Tensor const_shape_copy =
at::empty(c10::IntArrayRef(scale_shapes), options);
const_shape_copy.copy_(const_shape);
const_node_2->t_(Symbol::attr("value"), const_shape_copy);
Node* const_node_3 = graph->create(prim::Constant);
std::vector<int64_t> zero_point_v{zero_point};
std::vector<int64_t> zero_shapes{1};
auto const_zero =
at::from_blob(
zero_point_v.data(), c10::IntArrayRef(zero_shapes), at::kInt)
.to(at::kCPU);
at::Tensor const_zero_copy =
at::empty(c10::IntArrayRef(zero_shapes), options);
const_zero_copy.copy_(const_zero);
const_node_3->t_(Symbol::attr("value"), const_zero_copy);
return {const_node_1, const_node_2, const_node_3};
}
Node* CreateQuantizedBias(
std::vector<float> data,
std::shared_ptr<Graph>& graph,
std::vector<int64_t> shapes) {
Node* const_node_1 = graph->create(prim::Constant);
auto const_bias =
at::from_blob(data.data(), c10::IntArrayRef(shapes), at::kFloat)
.to(at::kCPU);
auto options = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU);
at::Tensor const_bias_copy = at::empty(c10::IntArrayRef(shapes), options);
const_bias_copy.copy_(const_bias);
const_node_1->t_(Symbol::attr("value"), const_bias_copy);
return const_node_1;
}
Node* createIntTuple(
const std::vector<int64_t>& is,
std::shared_ptr<Graph>& graph) {
@ -158,7 +211,8 @@ void unpackQuantizedWeightsHelper(
std::map<std::string, IValue>& paramsDict,
const std::string& pattern,
const std::string& unpack_fn,
QuantizedParamsType params_type) {
QuantizedParamsType params_type,
bool caffe2 = true) {
Graph pattern_graph;
std::unordered_map<std::string, Value*> vmap;
parseIR(pattern, &pattern_graph, vmap);
@ -368,26 +422,47 @@ void unpackQuantizedWeightsHelper(
const int64_t weight_zp = unpacked_weight.q_zero_point() + 128;
const int64_t wt_numel = unpacked_weight.numel();
// Create caffe2::Int8GivenTensorFill node
std::ostringstream os;
for (const auto i : c10::irange(wt_numel)) {
os << static_cast<char>(inp_data[i] + 128);
if (caffe2) {
// Create caffe2::Int8GivenTensorFill node
std::ostringstream os;
for (const auto i : c10::irange(wt_numel)) {
os << static_cast<char>(inp_data[i] + 128);
}
Node* c2_weight = CreateQuantizedWeightsCaffe2(
os.str(), graph, wt_sizes, unpacked_weight.q_scale(), weight_zp);
graph->setInsertPoint(qlinear_node);
c2_weight->insertBefore(qlinear_node);
qlinear_node->insertInput(1, c2_weight->output());
} else {
std::vector<float> unpacked_weight_values;
unpacked_weight_values.reserve(unpacked_weight.numel());
auto unpacked_weight_data =
reinterpret_cast<int8_t*>(unpacked_weight.data_ptr<c10::qint8>());
for (const auto i : c10::irange(unpacked_weight.numel())) {
unpacked_weight_values.push_back(
static_cast<float>(unpacked_weight_data[i]));
}
std::vector<Node*> c2_weight = CreateQuantizedWeights(
unpacked_weight_values,
graph,
wt_sizes,
static_cast<float>(unpacked_weight.q_scale()),
weight_zp);
graph->setInsertPoint(qlinear_node);
c2_weight[0]->insertBefore(qlinear_node);
qlinear_node->insertInput(1, c2_weight[0]->output());
c2_weight[1]->insertBefore(qlinear_node);
qlinear_node->insertInput(2, c2_weight[1]->output());
c2_weight[2]->insertBefore(qlinear_node);
qlinear_node->insertInput(3, c2_weight[2]->output());
}
Node* c2_weight = CreateQuantizedWeights(
os.str(), graph, wt_sizes, unpacked_weight.q_scale(), weight_zp);
graph->setInsertPoint(qlinear_node);
c2_weight->insertBefore(qlinear_node);
qlinear_node->insertInput(1, c2_weight->output());
// Add bias
at::Tensor original_bias;
if (bias.has_value()) {
original_bias = bias.value();
original_bias.set_requires_grad(false);
} else {
// Caffe2 ops always expect bias tensor so if not present create empty
// tensor.
int64_t bias_size = unpacked_weight.size(0);
original_bias =
at::zeros(bias_size, unpacked_weight.options().dtype(at::kFloat));
@ -402,24 +477,41 @@ void unpackQuantizedWeightsHelper(
input_val->type()->str());
auto input_node = match_vmap.at(vmap.at("r"))->node()->inputs()[0]->node();
auto input_scale = getScaleFromInput(input_node);
auto q_bias = at::quantize_per_tensor(
original_bias, weight_scale * input_scale, 0, at::kQInt32);
at::Tensor q_bias;
std::vector<int64_t> bias_values;
bias_values.reserve(q_bias.numel());
auto bias_data = (int32_t*)q_bias.data_ptr<c10::qint32>();
for (const auto i : c10::irange(q_bias.numel())) {
bias_values.push_back(bias_data[i]);
if (caffe2) {
auto input_scale = getScaleFromInput(input_node);
q_bias = at::quantize_per_tensor(
original_bias, weight_scale * input_scale, 0, at::kQInt32);
std::vector<int64_t> bias_values;
bias_values.reserve(q_bias.numel());
auto bias_data = (int32_t*)q_bias.data_ptr<c10::qint32>();
for (const auto i : c10::irange(q_bias.numel())) {
bias_values.push_back(bias_data[i]);
}
Node* c2_bias = CreateQuantizedBiasCaffe2(
bias_values,
graph,
q_bias.sizes().vec(),
q_bias.q_scale(),
q_bias.q_zero_point());
c2_bias->insertBefore(qlinear_node);
qlinear_node->insertInput(2, c2_bias->output());
} else {
std::vector<float> bias_values(original_bias.numel());
auto bias_data = original_bias.data_ptr<float>();
for (const auto i : c10::irange(original_bias.numel())) {
bias_values[i] = bias_data[i];
}
Node* bias =
CreateQuantizedBias(bias_values, graph, original_bias.sizes().vec());
bias->insertBefore(qlinear_node);
// For quantized_linear inputs, the order is input, weight, bias, ....
// We unpack weight into 3 values. Then it is
// input, weight_value, weight_scale, weight_zero_point, bias, ...
// Therefore bias is at location 4.
qlinear_node->insertInput(4, bias->output());
}
Node* c2_bias = CreateQuantizedBias(
bias_values,
graph,
q_bias.sizes().vec(),
q_bias.q_scale(),
q_bias.q_zero_point());
c2_bias->insertBefore(qlinear_node);
qlinear_node->insertInput(2, c2_bias->output());
// add conv arguments: stride, padding, dilation, groups
if (stride.has_value() && padding.has_value() && dilation.has_value() &&
@ -444,9 +536,59 @@ void unpackQuantizedWeightsHelper(
eraseUnusedValuesFromMap(valsToParamsMap);
}
}
static std::
unordered_map<c10::ScalarType, c10::ScalarType, ScalarTypeHashFunction>
qTypeToValType = {
{c10::ScalarType::QInt8, c10::ScalarType::Char},
{c10::ScalarType::QUInt8, c10::ScalarType::Byte},
{c10::ScalarType::QInt32, c10::ScalarType::Int},
{c10::ScalarType::QUInt4x2, c10::ScalarType::Byte},
};
// Unpack quantized tensor inputs into {value, scale, zero_point},
// then create a prim::TupleConstruct node based on these three values.
void UnpackQuantizedTensorInputs(std::shared_ptr<Graph>& graph) {
for (size_t index = 0; index < graph->inputs().size();) {
auto g_input = graph->inputs()[index];
TensorTypePtr shape_type = g_input->type()->cast<TensorType>();
if (!shape_type || !shape_type->scalarType().has_value()) {
index++;
continue;
}
auto scalar_type = shape_type->scalarType().value();
if (qTypeToValType.find(scalar_type) == qTypeToValType.end()) {
index++;
continue;
}
std::string input_name = g_input->debugName();
auto input_value =
graph->insertInput(index, input_name + "_value")
->setType(shape_type->withScalarType(qTypeToValType[scalar_type]));
// scale and zero_point types can be found at torch/include/ATen/Operators.h
auto input_scale =
graph->insertInput(index + 1, input_name + "_scale")
->setType(TensorType::create(
at::kDouble, at::kCPU, 0, /*requires_grad=*/c10::nullopt));
auto input_zero_point =
graph->insertInput(index + 2, input_name + "_zero_point")
->setType(TensorType::create(
at::kLong, at::kCPU, 0, /*requires_grad=*/c10::nullopt));
std::vector<Value*> converted{input_value, input_scale, input_zero_point};
auto input_tuple =
graph->prependNode(graph->createTuple(converted))->output();
g_input->replaceAllUsesWith(input_tuple);
// Erase the original quantized tensor input.
graph->eraseInput(index + converted.size());
index += 3;
}
}
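For reference, the {value, scale, zero_point} triple unpacked above corresponds to the standard per-tensor quantization attributes on the Python side (a hedged illustration, not exporter code):
import torch

q = torch.quantize_per_tensor(torch.rand(2, 2), scale=0.1, zero_point=0, dtype=torch.quint8)
print(q.int_repr())      # the underlying integer values
print(q.q_scale())       # 0.1
print(q.q_zero_point())  # 0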
// https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export
void UnpackQuantizedWeights(
std::shared_ptr<Graph>& graph,
std::map<std::string, IValue>& paramsDict) {
std::map<std::string, IValue>& paramsDict,
bool caffe2) {
std::string qlinear = R"(
graph(%input, %packed_weight, %w_scale, %w_zero_point):
%r = quantized::linear(%input, %packed_weight, %w_scale, %w_zero_point)
@ -472,31 +614,36 @@ void UnpackQuantizedWeights(
paramsDict,
qlinear,
"quantized::linear_unpack",
QuantizedParamsType::LINEAR);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d_relu,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d_relu,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
QuantizedParamsType::LINEAR,
caffe2);
if (caffe2) {
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv2d_relu,
"quantized::conv2d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
unpackQuantizedWeightsHelper(
graph,
paramsDict,
qconv3d_relu,
"quantized::conv3d_unpack",
QuantizedParamsType::CONV);
} else {
UnpackQuantizedTensorInputs(graph);
}
GRAPH_DUMP("After UnpackQuantizedWeights: ", graph);
}

View File

@ -2,6 +2,7 @@
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/onnx/onnx.h>
#include <memory>
@ -10,7 +11,8 @@ namespace jit {
TORCH_API void UnpackQuantizedWeights(
std::shared_ptr<Graph>& graph,
std::map<std::string, IValue>& paramsDict);
std::map<std::string, IValue>& paramsDict,
bool caffe2);
TORCH_API void insertPermutes(
std::shared_ptr<Graph>& graph,
std::map<std::string, IValue>& paramsDict);

View File

@ -11,6 +11,7 @@
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/jit/runtime/graph_iterator.h>
#include <torch/csrc/jit/runtime/jit_trace.h>
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/csrc/jit/runtime/static/ops.h>
@ -322,6 +323,17 @@ void createFusionGroups(Block* block, AliasDb* aliasDb, size_t min_size) {
inlineSmallFusionGroups(block, min_size);
}
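// prim::FallbackGraph nodes are (presumably) the unspecialized fallback
// subgraphs left behind when fusing with dynamic shapes; unmerging them
// inlines their bodies back into the owning graph before it is cloned into
// the original graph in performTensorExprFusion below.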
void inlineFallbackGraphs(std::shared_ptr<Graph> graph) {
DepthFirstGraphNodeIterator it(graph);
Node* n = nullptr;
while ((n = it.next()) != nullptr) {
if (n->kind() == prim::FallbackGraph) {
SubgraphUtils::unmergeSubgraph(n);
}
}
}
void performTensorExprFusion(
std::shared_ptr<Graph> graph,
std::vector<IValue> sample_inputs) {
@ -335,6 +347,7 @@ void performTensorExprFusion(
/*min_group_size*/ 2,
/*add_composed_op*/ false,
/*fuse_to_dynamic_shapes*/ true);
inlineFallbackGraphs(traced_graph);
graph->block()->clear();
graph->block()->cloneFrom(traced_graph->block(), nullptr);
GRAPH_DUMP("Graph after fusion: ", graph);

View File

@ -157,10 +157,10 @@ void OptimizeGraph(
// TODO: we can avoid this guard by moving operations
// to exposed folders.
#ifdef FBCODE_CAFFE2
if (opts.use_copy_variants) {
if (opts.use_copy_variants && !opts.enable_tensorexpr_fusion) {
ReplaceWithCopy(graph);
}
if (opts.use_maybe_copy_variants) {
if (opts.use_maybe_copy_variants && !opts.enable_tensorexpr_fusion) {
ReplaceWithMaybeCopy(graph);
}
FuseListUnpack(graph);

View File

@ -166,11 +166,18 @@ struct TORCH_API StaticModuleOptions {
bool manage_output_tensors{false};
// Gates the ReplaceWithCopy pass, which replaces ops that
// sometimes alias their outputs with out variants that
// always copy (so the output may participate in memory planning)
// always copy (so the output may participate in memory planning).
// Since replacing with copies is done after TensorExpr fusion, the
// resulting graph does not conform to the assumptions made in the fuser.
// So, even if this flag is turned on, the ReplaceWithCopy pass will not
// be executed if TensorExpr fusion is enabled.
bool use_copy_variants{true};
// Gates the ReplaceWithMaybeCopy pass, which replaces ops that
// sometimes alias their outputs with subgraphs that include an out
// variant.
// For the same reason as `use_copy_variants`, the ReplaceWithMaybeCopy pass
// will not be executed if TensorExpr fusion is enabled, even if this flag
// is turned on.
bool use_maybe_copy_variants{true};
// enable TensorExpr fusion of ops at model loading time
bool enable_tensorexpr_fusion{false};
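
For illustration, a minimal configuration sketch using the fields documented above; the torch::jit namespace and construction style are assumptions:

// Sketch only: enabling TensorExpr fusion for a static module.
torch::jit::StaticModuleOptions opts;
opts.enable_tensorexpr_fusion = true;  // fuse ops at model loading time
// With fusion on, these two flags are effectively ignored: the
// ReplaceWithCopy / ReplaceWithMaybeCopy passes are skipped so the fused
// graph keeps the aliasing assumptions the fuser relies on.
opts.use_copy_variants = true;
opts.use_maybe_copy_variants = true;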

View File

@ -21,6 +21,7 @@
#include <atomic>
#include <onnx/checker.h>
#include <onnx/shape_inference/implementation.h>
#include <onnx/onnx_pb.h>
#include <onnx/proto_utils.h>
@ -1248,13 +1249,18 @@ std::string serialize_model_proto_to_string(
return model_proto->SerializeAsString();
}
void check_onnx_proto(const std::string& proto_string) {
void check_onnx_proto(const std::string& proto_string, bool full_check) {
onnx::ModelProto model;
if (!ParseProtoFromBytes(&model, proto_string.c_str(), proto_string.size())) {
throw std::runtime_error("Invalid ONNX proto string.");
}
onnx::checker::check_model(model);
if (full_check) {
onnx::shape_inference::InferShapes(model);
}
}
} // namespace jit
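
A usage sketch for the extended check; the wrapper function is an assumption, and proto_string is taken to be the output of serialize_model_proto_to_string above:

// Sketch only: structural validation vs. full validation with shape inference.
void validateExportedModel(const std::string& proto_string) {
  torch::jit::check_onnx_proto(proto_string);                       // checker only
  torch::jit::check_onnx_proto(proto_string, /*full_check=*/true);  // also runs InferShapes
}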

View File

@ -61,7 +61,7 @@ export_onnx(
TORCH_API std::string serialize_model_proto_to_string(
const std::shared_ptr<::ONNX_NAMESPACE::ModelProto>& model_proto);
TORCH_API void check_onnx_proto(const std::string& proto_string);
TORCH_API void check_onnx_proto(const std::string& proto_string, bool full_check=false);
// Serializer for both old-style and unified format TorchScript serialization
class TORCH_API ScriptModuleSerializer {
@ -85,9 +85,6 @@ class TORCH_API ScriptModuleSerializer {
void convertNamedType(const c10::NamedTypePtr& class_type);
void convertTypes(const at::NamedTypePtr& root_type);
void writeExtraFiles(const Module& module, const ExtraFilesMap& extra_files);
void writeMobileMetadata(
const Module& module,
const ExtraFilesMap& extra_files);
void writeByteCode(const Module& module, bool save_mobile_debug_info);
void writeArchive(
const IValue& value,

View File

@ -946,6 +946,10 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
VLOG(3) << "Executing IR graph hash " << HashToString(hash)
<< " on device " << async->device << " done!";
TORCH_CHECK(async->tensors_data.size() == results.size(),
"Expected number of outputs does not match TorchScript Stack size: ",
async->tensors_data.size(), " != ", results.size());
for (const auto i : c10::irange(results.size())) {
if (async->tensors_data[i] != nullptr) {
async->tensors_data[i]->Assign(*results[i]);

View File

@ -3,6 +3,7 @@
#include <c10/core/ScalarType.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
#include <torch/csrc/lazy/core/tensor_util.h>
namespace torch {
@ -144,7 +145,7 @@ void LTCTensorImpl::setup_size_properties() {
// We can't call empty_tensor_restride(c10::MemoryFormat::Contiguous) given we override sizes() too.
std::vector<int64_t> updated_strides;
updated_strides = ComputeArrayStrides(shape.Get().sizes());
for (int i = 0; i < updated_strides.size(); i++) {
for (const auto i : c10::irange(updated_strides.size())) {
sizes_and_strides_.stride_at_unchecked(i) = updated_strides[i];
}
generation_ = generation;

View File

@ -1,3 +1,4 @@
#include <c10/util/irange.h>
#include <torch/csrc/lazy/core/view_ops/squeeze.h>
#include <torch/csrc/lazy/ts_backend/ts_lowering_context.h>
@ -9,7 +10,7 @@ namespace lazy {
std::vector<int64_t> BuildSqueezedDimensions(c10::ArrayRef<int64_t> dimensions,
int64_t squeeze_dim) {
std::vector<int64_t> output_dimensions;
for (int64_t i = 0; i < dimensions.size(); ++i) {
for (const auto i : c10::irange(dimensions.size())) {
int64_t dim = dimensions[i];
if (dim != 1 || (i != squeeze_dim && squeeze_dim >= 0)) {
output_dimensions.push_back(dim);
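
A quick worked example of the filter above (illustrative expectations only, derived by hand from the condition in the loop):

// dimensions = {2, 1, 3}, squeeze_dim = 1   ->  {2, 3}   (the size-1 dim at the squeeze index is dropped)
// dimensions = {1, 4},    squeeze_dim = 1   ->  {1, 4}   (index 1 is not size 1; the size-1 dim at index 0 is kept)
// dimensions = {2, 1, 1}, squeeze_dim = -1  ->  {2}      (a negative squeeze_dim drops every size-1 dim)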

Some files were not shown because too many files have changed in this diff.