Update on "fix torch::deploy OSS CI: previously the tests for torch::deploy were being skipped"

We match the build environment against linux-xenial-cuda11.3 in .jenkins/pytorch/build.sh but against linux-xenial-cuda11.1 in .jenkins/pytorch/test.sh. I suspect this mismatch causes the torch::deploy tests not to be run. Sending this PR out; I will verify in CI.

Differential Revision: [D31939631](https://our.internmc.facebook.com/intern/diff/D31939631)

[ghstack-poisoned]
This commit is contained in: f3e739463d
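The mismatch described above comes down to a substring check on BUILD_ENVIRONMENT: the test script only enables the torch::deploy suite when its expected environment marker appears in the name that build.sh produced. A minimal Python sketch of that failure mode (illustrative only, not the actual .jenkins shell code; the function and marker names here are assumptions):

# Illustration of how a stale substring gate on BUILD_ENVIRONMENT silently
# skips a test suite when build and test scripts disagree on the CUDA version.
def should_run_torch_deploy_tests(build_environment: str, expected_marker: str) -> bool:
    # The suite only runs when the marker appears in the environment name;
    # a stale marker returns False and the tests are skipped without any error.
    return expected_marker in build_environment

build_env = "linux-xenial-cuda11.3-py3.6-gcc7"   # what build.sh sets up
print(should_run_torch_deploy_tests(build_env, "linux-xenial-cuda11.1"))  # False: silently skipped
print(should_run_torch_deploy_tests(build_env, "linux-xenial-cuda11.3"))  # True: tests run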
@@ -4,11 +4,8 @@ from cimodel.lib.miniutils import quote
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN


# TODO: make this generated from a matrix rather than just a static list
# NOTE: All hardcoded docker image builds have been migrated to GHA
IMAGE_NAMES = [
    "pytorch-linux-bionic-rocm4.1-py3.6",
    "pytorch-linux-bionic-rocm4.2-py3.6",
    "pytorch-linux-bionic-rocm4.3.1-py3.6",
]

# This entry should be an element from the list above
.circleci/config.yml (generated): 9 changes

@@ -6506,15 +6506,6 @@ workflows:
      when: << pipeline.parameters.run_binary_tests >>
  build:
    jobs:
      - docker_build_job:
          name: "docker-pytorch-linux-bionic-rocm4.1-py3.6"
          image_name: "pytorch-linux-bionic-rocm4.1-py3.6"
      - docker_build_job:
          name: "docker-pytorch-linux-bionic-rocm4.2-py3.6"
          image_name: "pytorch-linux-bionic-rocm4.2-py3.6"
      - docker_build_job:
          name: "docker-pytorch-linux-bionic-rocm4.3.1-py3.6"
          image_name: "pytorch-linux-bionic-rocm4.3.1-py3.6"
      - pytorch_linux_build:
          name: pytorch_linux_xenial_py3_6_gcc5_4_build
          requires:
@@ -140,8 +140,6 @@ def generate_required_docker_images(items):

def gen_build_workflows_tree():
    build_workflows_functions = [
        # For rocm images, which don't have a circleci job equivalent
        cimodel.data.simple.docker_definitions.get_workflow_jobs,
        pytorch_build_definitions.get_workflow_jobs,
        cimodel.data.simple.macos_definitions.get_workflow_jobs,
        cimodel.data.simple.android_definitions.get_workflow_jobs,
.github/generated-ciflow-ruleset.json (generated, vendored): 4 changes

@@ -9,7 +9,6 @@
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-bionic-py3.6-clang9",
        "linux-vulkan-bionic-py3.6-clang9",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "linux-xenial-cuda11.3-py3.6-gcc7",
        "linux-xenial-py3-clang5-mobile-build",
        "linux-xenial-py3-clang5-mobile-code-analysis",
@@ -47,7 +46,6 @@
        "libtorch-linux-xenial-cuda10.2-py3.6-gcc7",
        "libtorch-linux-xenial-cuda11.3-py3.6-gcc7",
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "linux-xenial-cuda11.3-py3.6-gcc7",
        "periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
        "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck",
@@ -82,7 +80,6 @@
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-bionic-py3.6-clang9",
        "linux-vulkan-bionic-py3.6-clang9",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "linux-xenial-cuda11.3-py3.6-gcc7",
        "linux-xenial-py3-clang5-mobile-build",
        "linux-xenial-py3-clang5-mobile-code-analysis",
@@ -121,7 +118,6 @@
    ],
    "ciflow/slow": [
        "linux-bionic-cuda10.2-py3.9-gcc7",
        "linux-xenial-cuda10.2-py3.6-gcc7",
        "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck"
    ],
    "ciflow/slow-gradcheck": [
.github/scale-config.yml (vendored): 4 changes

@@ -15,6 +15,7 @@
#    os: linux
#    max_available: 20
#    disk_size: 50
#    is_ephemeral: true

runner_types:
  linux.2xlarge:
@@ -27,16 +28,19 @@ runner_types:
    os: linux
    max_available: 125
    disk_size: 150
    is_ephemeral: false
  linux.4xlarge.nvidia.gpu:
    instance_type: g3.4xlarge
    os: linux
    max_available: 125
    disk_size: 150
    is_ephemeral: false
  linux.16xlarge.nvidia.gpu:
    instance_type: g3.16xlarge
    os: linux
    max_available: 10
    disk_size: 150
    is_ephemeral: false
  windows.4xlarge:
    instance_type: c5d.4xlarge
    os: windows
.github/scripts/generate_ci_workflows.py (vendored): 37 changes

@@ -17,6 +17,7 @@ DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
GITHUB_DIR = Path(__file__).resolve().parent.parent

WINDOWS_CPU_TEST_RUNNER = "windows.4xlarge"
# contains 1 gpu
WINDOWS_CUDA_TEST_RUNNER = "windows.8xlarge.nvidia.gpu"
WINDOWS_RUNNERS = {
    WINDOWS_CPU_TEST_RUNNER,
@@ -24,7 +25,8 @@ WINDOWS_RUNNERS = {
}

LINUX_CPU_TEST_RUNNER = "linux.2xlarge"
LINUX_CUDA_TEST_RUNNER = "linux.8xlarge.nvidia.gpu"
# contains 1 gpu
LINUX_CUDA_TEST_RUNNER = "linux.4xlarge.nvidia.gpu"
LINUX_RUNNERS = {
    LINUX_CPU_TEST_RUNNER,
    LINUX_CUDA_TEST_RUNNER,
@@ -411,17 +413,6 @@ LINUX_WORKFLOWS = [
        build_environment="linux-bionic-cuda10.2-py3.9-gcc7",
        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
        test_runner_type=LINUX_CUDA_TEST_RUNNER,
        num_test_shards=2,
        ciflow_config=CIFlowConfig(
            run_on_canary=True,
            labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
        ),
    ),
    CIWorkflow(
        arch="linux",
        build_environment="linux-xenial-cuda10.2-py3.6-gcc7",
        docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
        test_runner_type=LINUX_CUDA_TEST_RUNNER,
        enable_jit_legacy_test=1,
        enable_multigpu_test=1,
        enable_nogpu_no_avx_test=1,
@@ -429,7 +420,8 @@ LINUX_WORKFLOWS = [
        enable_slow_test=1,
        num_test_shards=2,
        ciflow_config=CIFlowConfig(
            labels=set([LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]),
            run_on_canary=True,
            labels={LABEL_CIFLOW_SLOW, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA}
        ),
    ),
    CIWorkflow(
@@ -539,14 +531,23 @@ BAZEL_WORKFLOWS = [
    ),
]

DOCKER_WORKFLOWS = [
    DockerWorkflow(
        build_environment="docker-builds",
        docker_images=sorted({
DOCKER_IMAGES = {
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",  # for pytorch/xla
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.1-py3.6",  # for rocm
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.2-py3.6",  # for rocm
    f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.6",  # for rocm
}

DOCKER_IMAGES.update({
    workflow.docker_image_base
    for workflow in [*LINUX_WORKFLOWS, *BAZEL_WORKFLOWS]
    if workflow.docker_image_base
        }),
})

DOCKER_WORKFLOWS = [
    DockerWorkflow(
        build_environment="docker-builds",
        docker_images=sorted(DOCKER_IMAGES),
        # Run weekly to ensure they can build
        is_scheduled="1 * */7 * *",
    ),
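The last hunk above replaces the inline sorted set comprehension with a module-level DOCKER_IMAGES set that is then unioned with each workflow's docker_image_base. A small self-contained sketch of that pattern (the Workflow class and image names here are stand-ins, not the real CIWorkflow objects):

from typing import NamedTuple, Optional

class Workflow(NamedTuple):
    docker_image_base: Optional[str]

# Seed the set with images that have no workflow of their own (e.g. rocm).
DOCKER_IMAGES = {"registry/pytorch-linux-bionic-rocm4.1-py3.6"}

workflows = [
    Workflow("registry/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"),
    Workflow(None),  # workflows without a docker image are skipped
]

# A set deduplicates repeated image bases; sorted() keeps the generated
# workflow file deterministic from run to run.
DOCKER_IMAGES.update({w.docker_image_base for w in workflows if w.docker_image_base})
print(sorted(DOCKER_IMAGES))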
@@ -16,7 +16,7 @@ from typing_extensions import TypedDict


BUILD_ENVIRONMENT = os.getenv('BUILD_ENVIRONMENT')

assert BUILD_ENVIRONMENT is not None

class Config(TypedDict):
    num_shards: int
@@ -70,6 +70,7 @@ def main() -> None:
    if not run_as_if_on_trunk() and NUM_TEST_SHARDS_ON_PULL_REQUEST:
        NUM_TEST_SHARDS = int(NUM_TEST_SHARDS_ON_PULL_REQUEST)
    MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE')
    DISTRIBUTED_GPU_RUNNER_TYPE = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', TEST_RUNNER_TYPE)
    NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE')
    configs: Dict[str, Config] = {}
    if os.getenv('ENABLE_JIT_LEGACY_TEST'):
@@ -84,7 +85,10 @@ def main() -> None:
    if os.getenv('ENABLE_FORCE_ON_CPU_TEST'):
        configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE}
    if os.getenv('ENABLE_DISTRIBUTED_TEST'):
        configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
        configs['distributed'] = {
            'num_shards': 1,
            'runner': DISTRIBUTED_GPU_RUNNER_TYPE if "cuda" in str(BUILD_ENVIRONMENT) else TEST_RUNNER_TYPE
        }
    if os.getenv('ENABLE_SLOW_TEST'):
        configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
    if os.getenv('ENABLE_DOCS_TEST'):
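The second hunk above makes the distributed test config pick a dedicated GPU runner whenever the build environment is a CUDA one. A minimal standalone sketch of that selection (the environment-variable names mirror the hunk; the default values are illustrative):

import os

def distributed_runner(build_environment: str) -> str:
    test_runner = os.getenv('TEST_RUNNER_TYPE', 'linux.4xlarge.nvidia.gpu')
    distributed_gpu_runner = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', test_runner)
    # CUDA builds get the larger distributed GPU runner; everything else
    # keeps using the ordinary test runner.
    return distributed_gpu_runner if 'cuda' in build_environment else test_runner

print(distributed_runner('linux-xenial-cuda10.2-py3.6-gcc7'))
print(distributed_runner('linux-bionic-py3.6-clang9'))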
.github/templates/bazel_ci_workflow.yml.j2 (vendored): 2 changes

@@ -21,7 +21,7 @@ on:
    steps:
    !{{ common.setup_ec2_linux() }}
    !{{ common.checkout_pytorch("recursive") }}
    !{{ common.calculate_docker_image() }}
    !{{ common.calculate_docker_image(false) }}
    - name: Pull Docker image
      run: |
        !{{ common.pull_docker("${DOCKER_IMAGE}") }}
.github/templates/common.yml.j2 (vendored): 4 changes

@@ -183,7 +183,7 @@ concurrency:
        run: echo "${LABELS}"
{%- endmacro -%}

{%- macro calculate_docker_image() -%}
{%- macro calculate_docker_image(always_rebuild) -%}
  - name: Calculate docker image tag
    id: calculate-tag
    run: |
@@ -198,6 +198,7 @@ concurrency:
      BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
    run: |
      set -x
{%- if not always_rebuild %}
      # Check if image already exists, if it does then skip building it
      if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
        exit 0
@@ -222,6 +223,7 @@ concurrency:
        echo "       contact the PyTorch team to restore the original images"
        exit 1
      fi
{%- endif %}
      echo ::set-output name=rebuild::yes
  - name: Build and push docker image
    if: ${{ steps.check.outputs.rebuild }}
@@ -7,6 +7,7 @@ name: !{{ build_environment }}
{%- endblock %}

on:
  workflow_dispatch:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
@@ -35,7 +36,7 @@ jobs:
    steps:
    !{{ common.setup_ec2_linux() }}
    !{{ common.checkout_pytorch("recursive") }}
    !{{ common.calculate_docker_image() }}
    !{{ common.calculate_docker_image(true) }}
    - name: Pull Docker image
      run: |
        !{{ common.pull_docker("${DOCKER_IMAGE}") }}
.github/templates/linux_ci_workflow.yml.j2 (vendored): 3 changes

@@ -54,7 +54,7 @@ jobs:
    steps:
    !{{ common.setup_ec2_linux() }}
    !{{ common.checkout_pytorch("recursive") }}
    !{{ common.calculate_docker_image() }}
    !{{ common.calculate_docker_image(false) }}
    - name: Pull Docker image
      run: |
        !{{ common.pull_docker("${DOCKER_IMAGE}") }}
@@ -152,6 +152,7 @@ jobs:
      ENABLE_NOARCH_TEST: !{{ enable_noarch_test }}
      NUM_TEST_SHARDS: !{{ num_test_shards }}
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-docker-builds.yml (generated, vendored): 33 changes

@@ -4,6 +4,7 @@
name: docker-builds

on:
  workflow_dispatch:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
@@ -22,10 +23,18 @@ jobs:
    strategy:
      matrix:
        include:
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9'
            docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7'
            docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.6-clang9'
            docker_image_short_name: 'pytorch-linux-bionic-py3.6-clang9'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.1-py3.6'
            docker_image_short_name: 'pytorch-linux-bionic-rocm4.1-py3.6'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.2-py3.6'
            docker_image_short_name: 'pytorch-linux-bionic-rocm4.2-py3.6'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.6'
            docker_image_short_name: 'pytorch-linux-bionic-rocm4.3.1-py3.6'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7'
            docker_image_short_name: 'pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7'
          - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7'
@@ -110,30 +119,6 @@ jobs:
          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
          set -x
          # Check if image already exists, if it does then skip building it
          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
            exit 0
          fi
          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
            # if we're on the base branch then use the parent commit
            MERGE_BASE=$(git rev-parse HEAD~)
          else
            # otherwise we're on a PR, so use the most recent base commit
            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
          fi
          # Covers the case where a previous tag doesn't exist for the tree
          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
            exit 1
          fi
          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
          # If no image exists but the hash is the same as the previous hash then we should error out here
          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
            echo "       contact the PyTorch team to restore the original images"
            exit 1
          fi
          echo ::set-output name=rebuild::yes
      - name: Build and push docker image
        if: ${{ steps.check.outputs.rebuild }}
.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml (generated, vendored): 13 changes

@@ -260,19 +260,20 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
      ENABLE_NOGPU_NO_AVX_TEST: ''
      ENABLE_NOGPU_NO_AVX2_TEST: ''
      ENABLE_SLOW_TEST: ''
      ENABLE_JIT_LEGACY_TEST: 1
      ENABLE_MULTIGPU_TEST: 1
      ENABLE_NOGPU_NO_AVX_TEST: 1
      ENABLE_NOGPU_NO_AVX2_TEST: 1
      ENABLE_SLOW_TEST: 1
      ENABLE_DOCS_TEST: ''
      ENABLE_BACKWARDS_COMPAT_TEST: ''
      ENABLE_XLA_TEST: ''
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-bionic-py3.6-clang9.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: 1
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-vulkan-bionic-py3.6-clang9.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 1
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-cuda10.2-py3.6-gcc7.yml (generated, vendored): 517 changes

@@ -1,517 +0,0 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-xenial-cuda10.2-py3.6-gcc7

on:
  pull_request:
    types: [opened, synchronize, reopened, unassigned]
  push:
    branches:
      - master
      - release/*
  workflow_dispatch:

env:
  BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7
  DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
  SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
  XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
  TORCH_CUDA_ARCH_LIST: 5.2
  IN_CI: 1
  IS_GHA: 1
  # This is used for the phase of adding wheel tests only, will be removed once completed
  IN_WHEEL_TEST: 1
  # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
  CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  AWS_DEFAULT_REGION: us-east-1
  CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
  CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
concurrency:
  group: linux-xenial-cuda10.2-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:

  ciflow_should_run:
    runs-on: ubuntu-18.04
    env:
      IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }}
      LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow') }}
      LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
    if: ${{ (github.repository == 'pytorch/pytorch') && (
            (github.event_name == 'push') ||
            (github.event_name == 'schedule') ||
            (contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cuda') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') || contains(github.event.pull_request.labels.*.name, 'ciflow/slow')) ||
            (false))
         }}
    steps:
      - name: noop
        run: echo running ciflow_should_run
      - name: print labels
        run: echo "${LABELS}"

  build:
    runs-on: linux.2xlarge
    needs: [ciflow_should_run]
    env:
      JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-build
    outputs:
      docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Chown workspace
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE:?}/*"
          rm -f ~/.ssh/authorized_keys
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Calculate docker image tag
        id: calculate-tag
        run: |
          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
          echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
          echo "::set-output name=docker_tag::${DOCKER_TAG}"
          echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
      - name: Check if image should be built
        id: check
        env:
          BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
          set -x
          # Check if image already exists, if it does then skip building it
          if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
            exit 0
          fi
          if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
            # if we're on the base branch then use the parent commit
            MERGE_BASE=$(git rev-parse HEAD~)
          else
            # otherwise we're on a PR, so use the most recent base commit
            MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
          fi
          # Covers the case where a previous tag doesn't exist for the tree
          # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
          if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
            echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
            exit 1
          fi
          PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
          # If no image exists but the hash is the same as the previous hash then we should error out here
          if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
            echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
            echo "       contact the PyTorch team to restore the original images"
            exit 1
          fi
          echo ::set-output name=rebuild::yes
      - name: Build and push docker image
        if: ${{ steps.check.outputs.rebuild }}
        env:
          DOCKER_SKIP_S3_UPLOAD: 1
        working-directory: .circleci/docker
        run: |
          export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
          ./build_docker.sh
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Build
        env:
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
        run: |
          # detached container should get cleaned up by teardown_ec2_linux
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e JOB_BASE_NAME \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e IS_GHA \
            -e CIRCLE_PR_NUMBER \
            -e CIRCLE_SHA1 \
            -e CIRCLE_BRANCH \
            -e GITHUB_RUN_ID \
            -e SCCACHE_BUCKET \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
      - name: Display and upload binary build size statistics (Click Me)
        # temporary hack: set CIRCLE_* vars, until we update
        # tools/stats/print_test_stats.py to natively support GitHub Actions
        env:
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
        run: |
          COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
          export COMMIT_TIME
          pip3 install requests==2.26 boto3==1.16.34
          python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
      - name: Chown workspace
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Archive artifacts into zip
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
      - uses: seemethere/upload-artifact-s3@v3
        name: Store PyTorch Build Artifacts on S3
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
          retention-days: 14
          if-no-files-found: error
          path:
            artifacts.zip
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Clean up docker images
        if: always()
        run: |
          # Prune all of the docker images
          docker system prune -af

  generate-test-matrix:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: 1
      ENABLE_MULTIGPU_TEST: 1
      ENABLE_NOGPU_NO_AVX_TEST: 1
      ENABLE_NOGPU_NO_AVX2_TEST: 1
      ENABLE_SLOW_TEST: 1
      ENABLE_DOCS_TEST: ''
      ENABLE_BACKWARDS_COMPAT_TEST: ''
      ENABLE_XLA_TEST: ''
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
      ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
    container:
      image: python:3.9
    steps:
      - name: Install dependencies
        run: pip install typing-extensions==3.10
      - name: Clone pytorch/pytorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
      - name: Generating test matrix
        id: set-matrix
        run: .github/scripts/generate_pytorch_test_matrix.py

  test:
    needs: [build, generate-test-matrix, ciflow_should_run]
    strategy:
      matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
      JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-test
      TEST_CONFIG: ${{ matrix.config }}
      SHARD_NUMBER: ${{ matrix.shard }}
      NUM_TEST_SHARDS: ${{ matrix.num_shards }}
      PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          function get_ec2_metadata() {
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
      - name: Log in to ECR
        env:
          AWS_RETRY_MODE: standard
          AWS_MAX_ATTEMPTS: 5
        run: |
          aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
          bash /tmp/ecr-login.sh
          rm /tmp/ecr-login.sh
      - name: Chown workspace
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${ALPINE_IMAGE}"
          # Ensure the working directory gets chowned back to the current user
          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Clean workspace
        run: |
          rm -rf "${GITHUB_WORKSPACE:?}/*"
          rm -f ~/.ssh/authorized_keys
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        uses: seemethere/add-github-ssh-key@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Preserve github env variables for use in docker
        run: |
          env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
      - name: Pull Docker image
        run: |
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          retry docker pull "${DOCKER_IMAGE}"
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
        run: |
          bash .github/scripts/install_nvidia_utils_linux.sh
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Determine shm-size
        run: |
          shm_size="1g"
          case "${BUILD_ENVIRONMENT}" in
            *cuda*)
              shm_size="2g"
              ;;
            *rocm*)
              shm_size="8g"
              ;;
          esac
          echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        name: Download PyTorch Build Artifacts
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
      - name: Unzip artifacts
        run: |
          unzip -o artifacts.zip
      - name: Output disk space left
        run: |
          sudo df -H
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Test
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
        # Time out the test phase after 240 minutes
        timeout-minutes: 240
        run: |
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.jenkins/caffe2/test.sh
          else
            TEST_COMMAND=.jenkins/pytorch/test.sh
          fi
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
            -e GITHUB_ACTIONS \
            -e IN_CI \
            -e IS_GHA \
            -e CIRCLE_BRANCH \
            -e CIRCLE_SHA1 \
            -e CIRCLE_PR_NUMBER \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e JOB_BASE_NAME \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e PYTORCH_IGNORE_DISABLED_ISSUES \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
      - name: Chown workspace
        if: always()
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Install render_test_results dependencies
        if: always()
        shell: bash
        run: |
          python3 -m pip install junitparser==2.1.1 rich==10.9.0
      - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
        if: always()
        shell: bash
        # Encoding is weird on windows, just try to default to utf-8 if possible
        env:
          PYTHONIOENCODING: "utf-8"
        run: |
          python3 tools/render_junit.py test/
      - name: Zip test reports for upload
        if: always()
        env:
          FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
        run: |
          # Remove any previous test reports if they exist
          rm -f test-reports-*.zip
          zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
      - uses: seemethere/upload-artifact-s3@v3
        name: Store Test Reports on S3
        if: always()
        with:
          retention-days: 14
          if-no-files-found: error
          path:
            test-reports-*.zip
      - name: Display and upload test statistics (Click Me)
        if: always()
        # temporary hack: set CIRCLE_* vars, until we update
        # tools/stats/print_test_stats.py to natively support GitHub Actions
        env:
          AWS_DEFAULT_REGION: us-east-1
          CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
          JOB_BASE_NAME: linux-xenial-cuda10.2-py3.6-gcc7-test
          CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
          CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
          CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
        shell: bash
        run: |
          python3 -m pip install -r requirements.txt
          python3 -m pip install boto3==1.16.34
          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
      - name: Hold runner for 2 hours or until ssh sessions have drained
        # Always hold for active ssh sessions
        if: always()
        run: .github/scripts/wait_for_ssh_to_drain.sh
      - name: Chown workspace
        if: always()
        env:
          ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
        run: |
          # Ensure the working directory gets chowned back to the current user
          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
      - name: Kill containers, clean up images
        if: always()
        run: |
          # ignore expansion of "docker ps -q" since it could be empty
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Prune all of the docker images
          docker system prune -af
.github/workflows/generated-linux-xenial-cuda11.3-py3.6-gcc7.yml (generated, vendored): 3 changes

@@ -260,7 +260,7 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-clang7-asan.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-clang7-onnx.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-gcc5.4.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-linux-xenial-py3.6-gcc7.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-parallelnative-linux-xenial-py3.6-gcc5.4.yml (generated, vendored): 1 change

@@ -273,6 +273,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 1
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
@@ -258,7 +258,7 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: ''
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
@@ -271,6 +271,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.6-gcc7.yml (generated, vendored): 3 changes

@@ -258,7 +258,7 @@ jobs:
    runs-on: ubuntu-18.04
    needs: [ciflow_should_run]
    env:
      TEST_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu
      ENABLE_DISTRIBUTED_TEST: 1
      ENABLE_JIT_LEGACY_TEST: ''
      ENABLE_MULTIGPU_TEST: ''
@@ -271,6 +271,7 @@ jobs:
      ENABLE_NOARCH_TEST: ''
      NUM_TEST_SHARDS: 2
      MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
      DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu
      NOGPU_RUNNER_TYPE: linux.2xlarge
      PR_BODY: ${{ github.event.pull_request.body }}
    outputs:
.github/workflows/lint.yml (vendored): 2 changes

@@ -105,7 +105,7 @@ jobs:
      - name: Ensure all test files have header containing ownership information
        if: always()
        run: |
          (! git grep -L "# Owner(s): \[" test/distributed/**/test_*.py) || (printf "The above test files are missing a comment header with ownership information; please add the following line\n\n# Owner(s): [\"<owner: label>\"]\n\nto the top of each test file. The owner should be an existing pytorch/pytorch label."; false)
          (! git grep -L "# Owner(s): \[" ./test/distributed/**/test_*.py || (printf "The above test files are missing a comment header with ownership information; please add the following line\n\n# Owner(s): [\"<owner: label>\"]\n\nto the top of each test file. The owner should be an existing pytorch/pytorch label."; false))

  clang-format:
    runs-on: ubuntu-18.04
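The lint rule above fails the job when any test/distributed test file lacks an ownership comment header. A rough Python equivalent of that check, for illustration only (the real job uses git grep as shown):

# Flag distributed test files missing an "# Owner(s): [" header.
from pathlib import Path

def files_missing_owner_header(root: str = "test/distributed") -> list:
    missing = []
    for path in Path(root).rglob("test_*.py"):
        text = path.read_text(encoding="utf-8", errors="ignore")
        if "# Owner(s): [" not in text:
            missing.append(str(path))
    return missing

if __name__ == "__main__":
    offenders = files_missing_owner_header()
    if offenders:
        print("Missing ownership header:", *offenders, sep="\n  ")
        raise SystemExit(1)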
@@ -181,9 +181,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # JIT C++ extensions require ninja, so put it into PATH.
  export PATH="/var/lib/jenkins/.local/bin:$PATH"
  if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then
    pip install -q --user flatbuffers==2.0
    wget https://ortpypackage.blob.core.windows.net/ort-nightly/ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
    pip install -q --user ort_nightly-1.8.0.dev202107131-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
    pip install -q --user flatbuffers==2.0 onnxruntime==1.9.0
  fi
  "$ROOT_DIR/scripts/onnx/test.sh"
fi
@@ -523,7 +523,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then
  if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.3*-test1* ]]; then
  if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.3* ]]; then
    echo "Testing torch::deploy"
    test_torch_deploy
  fi
  test_without_numpy
@@ -127,6 +127,11 @@ genrule(
        "aten/src/ATen/Declarations.yaml",
        "aten/src/ATen/RegisterBackendSelect.cpp",
        "aten/src/ATen/RegisterCPU.cpp",
        "aten/src/ATen/RegisterFunctionalization_0.cpp",
        "aten/src/ATen/RegisterFunctionalization_1.cpp",
        "aten/src/ATen/RegisterFunctionalization_2.cpp",
        "aten/src/ATen/RegisterFunctionalization_3.cpp",
        # "aten/src/ATen/RegisterFunctionalizationEverything.cpp",
        "aten/src/ATen/RegisterMkldnnCPU.cpp",
        "aten/src/ATen/RegisterQuantizedCPU.cpp",
        "aten/src/ATen/RegisterSparseCPU.cpp",
@@ -143,6 +148,7 @@ genrule(
        "aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h",
        "aten/src/ATen/CompositeImplicitAutogradFunctions.h",
        "aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h",
        "aten/src/ATen/FunctionalInverses.h",
        "aten/src/ATen/Functions.h",
        "aten/src/ATen/Functions.cpp",
        "aten/src/ATen/RedispatchFunctions.h",
@@ -11,6 +11,7 @@
/torch/nn/ @albanD @jbschlosser
/torch/optim/ @albanD
/test/test_public_bindings.py @albanD
/docs/source/conf.py @albanD
/aten/src/ATen/native/native_functions.yaml @ezyang

# Tensorpipe RPC Agent.
@@ -1026,6 +1026,8 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
  m.impl("size.int", static_cast<int64_t (*)(const Tensor&, int64_t)>(native::size));
  m.impl("_add_batch_dim", native::_add_batch_dim);
  m.impl("_remove_batch_dim", native::_remove_batch_dim);
  m.impl("_make_dual", native::_make_dual);
  m.impl("is_same_size", native::is_same_size);

  m.impl("sum.dim_IntList", sum_batching_rule);
  m.impl("is_complex", native::is_complex);
234
aten/src/ATen/FunctionalInverses.cpp
Normal file
234
aten/src/ATen/FunctionalInverses.cpp
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
|
||||
#include <ATen/FunctionalInverses.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/ExpandUtils.h>
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
// This logic is similar to autograd code for view backwards calls.
|
||||
// We can't easily share it though, because (eventually) these functions
|
||||
// will all call `permute/unsqueeze_copy()` instead of `permute/unsqueeze`.
|
||||
|
||||
Tensor permute_inverse(const Tensor& self, IntArrayRef dims) {
|
||||
// invert the permutation
|
||||
auto ndims = dims.size();
|
||||
std::vector<int64_t> dims_(ndims);
|
||||
for(const auto i : c10::irange(ndims)) {
|
||||
dims_[at::maybe_wrap_dim(dims[i], ndims)] = i;
|
||||
}
|
||||
return self.permute(dims_);
|
||||
}
|
||||
|
||||
Tensor unsqueeze_to(const Tensor & self, IntArrayRef sizes) {
|
||||
auto result = self;
|
||||
|
||||
int64_t nDims = sizes.size();
|
||||
for(const auto dim : c10::irange(nDims)) {
|
||||
if (sizes[dim] == 1) {
|
||||
result = result.unsqueeze(dim);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) {
|
||||
dim = at::maybe_wrap_dim(dim, sizes.size());
|
||||
// in NumPy it's not an error to unsqueeze a scalar, but we still need to avoided
|
||||
// unsqueezing in the backward.
|
||||
if (sizes.size() > 0 && sizes[dim] == 1) {
|
||||
return self.unsqueeze(dim);
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
// Note [Functionalization Pass: View Inverses].
|
||||
// This file contains the implementation of each "view inverse".
|
||||
// These aren't really true inverses in the mathematically sense: each view inverse describes how to undo
|
||||
// the original view (although it takes in different arguments).
|
||||
//
|
||||
// E.g. Below is an example of a program that has alias operations removed, and the role that view inverses play:
|
||||
//
|
||||
// normal program with views and mutations:
|
||||
// view1 = input1.view_op(args...)
|
||||
// view1.add_(1) (perform a mutation on the view, which should also modify input)
|
||||
|
||||
// version of the program with no aliasing, that instead uses view_inverse functions:
|
||||
// view_copy1 = input1.view_copy_op(args...)
|
||||
// view_copy1.add_(1) (perform a mutation on view_copy1. At this point, input1 is NOT modified)
|
||||
// x = view_op_inverse(input1, view_copy1, args...)
|
||||
//
|
||||
// at this point, input1 and x should be equal
|
||||
//
|
||||
// Note that input1 is also passed as an argument to view_op_inverse in the above example.
|
||||
// This isn't actually required for most view operators: it's only required for view ops
|
||||
// where you can't figure out what the size of the base tensor is given just the view tensor and arguments.
|
||||
// Examples are slice/select/scatter/squeeze/as_strided.
|
||||
// We happen to be passing in the base tensor in all cases, mostly to make the codegen simpler.
|
||||
// But you'll see below that the "base" argument is ignored by most view_inverse implementations.
|
||||
|
||||
// ----------------------------------------------------------
|
||||
// Implementations of each view_inverse() function are below.
|
||||
// One of these needs to be implemented for every existing non-composite view operator.
|
||||
// The codegen automatically generates the corresponding function declaration.
|
||||
// ----------------------------------------------------------
|
||||
|
||||
Tensor FunctionalInverses::_fw_primal_inverse(const at::Tensor& base, const at::Tensor& mutated_view, int64_t level) {
|
||||
TORCH_INTERNAL_ASSERT(false, "Attempted to call _fw_primal() during the functionalization pass. For now, this is not supported.");
|
||||
return Tensor();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::view_as_real_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return at::view_as_complex(mutated_view);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::view_as_complex_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return at::view_as_real(mutated_view.resolve_conj());
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::_conj_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return mutated_view.conj();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::_neg_view_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return mutated_view.neg();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset) {
|
||||
TORCH_INTERNAL_ASSERT(false, "as_strided has not been implemented in the functionalization pass yet");
|
||||
return Tensor();
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::diagonal_inverse(const Tensor& base, const Tensor& mutated_view, int64_t offset, int64_t dim1, int64_t dim2) {
|
||||
return base.diagonal_scatter(mutated_view, offset, dim1, dim2);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::expand_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, bool implicit) {
|
||||
return at::sum_to(mutated_view, base.sizes());
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::permute_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef dims) {
|
||||
return at::functionalization::permute_inverse(mutated_view, dims);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::_reshape_alias_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride) {
|
||||
// Note that I'm directly calling reshape(), and ignoring the strides.
|
||||
// _reshape_alias() isn't available from user code, and is an implementation detail of reshape().
|
||||
// Specifically, passing in the strides directly can get us into trouble in cases like:
|
||||
// b = a[0]; c = b.reshape(...); c.add_(1); print(a)
|
||||
// When we eventually run the _reshape_alias_inverse() call here, if we were to pass in both sizes and strides,
|
||||
// The call would fail because `mutated_view` doesn't have enough bytes of storage.
|
||||
return mutated_view.reshape(base.sizes());
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::select_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, int64_t index) {
|
||||
return base.select_scatter(mutated_view, dim, index);
|
||||
}
|
||||
Tensor FunctionalInverses::detach_inverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
// the functionalization pass doesn't care about autograd metadata - as a view, I think detach() is just an identity function
|
||||
return mutated_view;
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
|
||||
return base.slice_scatter(mutated_view, dim, start, end, step);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t split_size, int64_t dim) {
|
||||
// It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can.
|
||||
// For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i
|
||||
// on top of the base tensor.
|
||||
// For autograd, we have all of the tensors outputted by split() and we just want to stack them.
|
||||
dim = at::maybe_wrap_dim(dim, base.sizes().size());
|
||||
auto dim_size = base.size(dim);
|
||||
auto start = mutated_view_idx * split_size;
|
||||
auto end = start + split_size;
|
||||
if (end > dim_size) end = dim_size;
|
||||
return base.slice_scatter(mutated_view, dim, start, end, 1);
|
||||
}
|
||||
|
||||
Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, at::IntArrayRef split_sizes, int64_t dim) {
|
||||
dim = at::maybe_wrap_dim(dim, base.sizes().size());
|
||||
auto dim_size = base.size(dim);
|
||||
int64_t start = 0;
|
||||
for (auto i = 0; i < mutated_view_idx; ++i) {
|
||||
start += split_sizes[i];
|
||||
}
|
||||
auto end = start + split_sizes[mutated_view_idx];
|
||||
if (end > dim_size) end = dim_size;
|
||||
return base.slice_scatter(mutated_view, dim, start, end, 1);
|
||||
}

Tensor FunctionalInverses::squeeze_inverse(const Tensor& base, const Tensor& mutated_view) {
  return unsqueeze_to(mutated_view, base.sizes());
}

Tensor FunctionalInverses::squeeze_dim_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) {
  return unsqueeze_to(mutated_view, dim, base.sizes());
}

Tensor FunctionalInverses::t_inverse(const Tensor& base, const Tensor& mutated_view) {
  return mutated_view.t();
}

Tensor FunctionalInverses::transpose_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim0, int64_t dim1) {
  return mutated_view.transpose(dim0, dim1);
}

Tensor FunctionalInverses::unsqueeze_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) {
  return mutated_view.squeeze(dim);
}

Tensor FunctionalInverses::_indices_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call _indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::_values_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call _values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::indices_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::values_inverse(const Tensor& base, const Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::crow_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call crow_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::col_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) {
  TORCH_INTERNAL_ASSERT(false, "Attempted to call col_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization");
  return Tensor();
}

Tensor FunctionalInverses::unbind_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t dim) {
  dim = at::maybe_wrap_dim(dim, base.sizes().size());
  return base.select_scatter(mutated_view, dim, mutated_view_idx);
}

Tensor FunctionalInverses::view_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size) {
  return mutated_view.view(base.sizes());
}

Tensor FunctionalInverses::view_dtype_inverse(const Tensor& base, const Tensor& mutated_view, at::ScalarType dtype) {
  return mutated_view.view(base.scalar_type());
}

Tensor FunctionalInverses::unfold_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dimension, int64_t size, int64_t step) {
  // I think autograd and the functionalization pass want the exact same thing here, but need to test to confirm.
  return unfold_backward(mutated_view, base.sizes(), dimension, size, step);
}

Tensor FunctionalInverses::alias_inverse(const Tensor& base, const Tensor& mutated_view) {
  return mutated_view;
}

} // functionalization
} // at
117  aten/src/ATen/FunctionalStorageImpl.cpp  Normal file
@@ -0,0 +1,117 @@
#include <ATen/FunctionalStorageImpl.h>

#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <c10/util/Exception.h>
#include <vector>

namespace at {
namespace functionalization {

ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
  if (out_idx == this->out_index) return *this;
  return ViewMeta(forward_fn, reverse_fn, out_idx);
}

Alias::Alias(const at::Tensor& base) {
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base));
  base_ = base;
}

const at::Tensor& Alias::base() const {
  return base_;
}

void Alias::add_update(const at::Tensor& updated_val, const std::vector<ViewMeta>& metas) {
  updates_.push_back({updated_val, metas});
  generation_++;
}

// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the Alias object.
// We start out with <original_base> and <mutated_view>, and our goal is to end up with <mutated_base>.
// Consider this program:
//
// base = ...
// a = base.view1()
// b = a.view2()
// c = b.view3()
// c.add_(3)
//
// Then the functionalization pass will queue an update as follows:
//
// update.new_val = c  # the updated value of c
// update.view_metas = [view1_meta, view2_meta, view3_meta]
//
// Syncing any of a, b or c will eventually call apply_update() on the alias, and the following will run:
//
// tmp_values = [base, a, b]  # NB: c is not necessary
// t = update.new_val
// t = view3_inverse(b, t, 0)  # 0 is the output index; these are all single-output views, so it's 0
// t = view2_inverse(a, t, 0)
// t = view1_inverse(base, t, 0)  # t now represents the updated alias.
// alias.base_ = t
const Tensor apply_update(const Alias::Update& update, const Tensor& base) {
  at::Tensor t = update.new_val;
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  std::vector<at::Tensor> tmp_values({base});
  for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
    at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
    // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided.
    // All of these ops require additional information to recover the sizes of the original tensor.
    // If we needed to, we could probably apply this optimization and only bother computing tmp_values
    // for those necessary view ops.
    tmp_values.push_back(std::move(next_view));
  }
  for (int i = update.view_metas.size() - 1; i >= 0; --i) {
    int64_t out_idx = update.view_metas[i].out_index;
    // Each view inverse is implemented in ViewInverses.cpp.
    t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  return t;
}
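
For readers who want to see the replay loop in isolation, here is a self-contained sketch (hypothetical, not part of the diff) that mirrors apply_update(): it records the intermediate bases for a two-view chain, then walks the chain backwards through the corresponding *_scatter inverses:

#include <ATen/ATen.h>
#include <functional>
#include <vector>

void view_chain_replay_sketch() {
  using ViewFn = std::function<at::Tensor(const at::Tensor&)>;
  using InverseFn = std::function<at::Tensor(const at::Tensor&, const at::Tensor&)>;

  // A toy chain: b = base.select(0, 1); c = b.slice(0, 0, 2)
  std::vector<ViewFn> forward = {
      [](const at::Tensor& t) { return t.select(0, 1); },
      [](const at::Tensor& t) { return t.slice(0, 0, 2); },
  };
  std::vector<InverseFn> reverse = {
      [](const at::Tensor& b, const at::Tensor& v) { return b.select_scatter(v, 0, 1); },
      [](const at::Tensor& b, const at::Tensor& v) { return b.slice_scatter(v, 0, 0, 2, 1); },
  };

  at::Tensor base = at::zeros({3, 4});
  // Forward pass: record every intermediate base, like tmp_values in apply_update().
  std::vector<at::Tensor> tmp_values = {base};
  for (size_t i = 0; i + 1 < forward.size(); ++i) {
    tmp_values.push_back(forward[i](tmp_values.back()));
  }
  // Pretend the innermost view was mutated.
  at::Tensor t = forward[1](forward[0](base)) + 1;
  // Reverse pass: walk the chain backwards, applying each view inverse.
  for (int i = static_cast<int>(reverse.size()) - 1; i >= 0; --i) {
    t = reverse[i](tmp_values[i], t);
  }
  // t is now the "mutated base": the update is reflected at base[1, 0:2].
  TORCH_CHECK(t.sizes() == base.sizes());
}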

void Alias::apply_updates() {
  // N.B.: none of the tensors used in this function should be FunctionalTensorWrappers at this point.
  // The only reason we currently need the TLS exclude guard here is because of functorch's DynamicLayer stack.
  // It adds the Functionalize key into TLS before redispatching to the functionalization kernels,
  // which means that we need to explicitly exclude it here before doing any other work underneath the pass.
  at::AutoDispatchSkipFunctionalize guard;
  for (auto& update_data: updates_) {
    base_ = apply_update(update_data, base_);
  }
  updates_.clear();
}

FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& value)
  : c10::StorageImpl(
      c10::StorageImpl::use_byte_size_t(),
      value.numel() * value.dtype().itemsize(),
      DataPtr{nullptr, value.device()},
      // Using a null allocator, since FunctionalTensorImpls aren't resizeable.
      nullptr,
      /*resizeable=*/false
    ),
    alias_(Alias(value))
  {}

void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& view_metas) {
  alias_.add_update(updated_val, view_metas);
}

void FunctionalStorageImpl::apply_updates() {
  alias_.apply_updates();
}

const Tensor& FunctionalStorageImpl::base() {
  return alias_.base();
}

size_t FunctionalStorageImpl::generation() const {
  return alias_.generation();
}

} // namespace functionalization
} // namespace at

110  aten/src/ATen/FunctionalStorageImpl.h  Normal file
@@ -0,0 +1,110 @@
#pragma once

#include <ATen/Tensor.h>

namespace at {
namespace functionalization {

// See Note [Functionalization Pass In Core]

// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta on b that looks like:
//
// ViewMeta(
//   [<captures>](const Tensor& base, int64_t mutated_view_idx) {
//     return base.view1(...);
//   },
//   [<captures>](const at::Tensor& base, const at::Tensor& mutated_view, int64_t mutated_view_idx) -> at::Tensor {
//     return at::functionalization::impl::view1_inverse(base, mutated_view, ...);
//   }
// )
//
// The forward_fn lambda describes how to replay view1 on a tensor.
//
// The reverse_fn lambda describes how, given a tensor that is already a view, to get the corresponding base tensor.
// See Note [Functionalization Pass: View Inverses] for details.
struct ViewMeta {
  ViewMeta(
    std::function<Tensor(const Tensor&, int64_t)> forward,
    std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
    int64_t out_idx = 0) :
      forward_fn(forward),
      reverse_fn(reverse),
      out_index(out_idx)
  {}

  std::function<Tensor(const Tensor&, int64_t)> forward_fn;
  std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
  // See Note [out_idx in ViewMeta]
  int64_t out_index;

  // Returns a copy of the current ViewMeta, if out_idx matches the current out_index.
  // Otherwise, returns a new ViewMeta with the same forward/reverse functions, but a new out index.
  ViewMeta to_out_idx(int64_t out_idx);
};
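
As a concrete (hypothetical) instance of that signature, a ViewMeta for `b = a.transpose(0, 1)` could be built as below; in the real pass these lambdas are generated by codegen for every view op, so this is illustration only:

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

// Sketch only - transpose is its own inverse, so reverse_fn simply transposes back.
at::functionalization::ViewMeta make_transpose_meta() {
  return at::functionalization::ViewMeta(
      [](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.transpose(0, 1);          // replay the view on a new base
      },
      [](const at::Tensor& /*base*/, const at::Tensor& mutated_view, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return mutated_view.transpose(0, 1);  // recover the base from the (mutated) view
      });
}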

// Alias represents the state shared by (potentially multiple) views of the same tensor.
// For example, in the following code:
//
// b = a.view1(...)
// c = b.view2(...)
// b.add_(1)
// --> alias.add_update(b, {view1_meta})
//
// The call to add_(1) will result in a call to alias.add_update(b, {view1_meta}), queueing up
// the mutation from b onto the alias.
// Later, suppose c is used in an expression (e.g. you try to print c, or pass it to an operator).
// Doing so will involve "syncing" c.
// First we apply any pending updates to the alias, and then we regenerate c
// by replaying its views off of the updated alias. E.g.:
//
// print(str(c))
// --> c.sync_()
// --> alias.apply_updates()  // after this, the alias will be updated to reflect the mutation to b
class Alias {
 public:
  struct Update {
    const at::Tensor& new_val;
    const std::vector<ViewMeta>& view_metas;
  };
  explicit Alias(const at::Tensor& base);
  const at::Tensor& base() const;
  size_t generation() const { return generation_; }
  void add_update(const at::Tensor& updated_val, const std::vector<ViewMeta>& metas);
  void apply_updates();
 private:
  // NB: base_ should always point to a tensor BELOW the current functionalization layer.
  // This is mainly to avoid reference cycles.
  // e.g. given `b = a.view(...)`
  // Both a.storage_ and b.storage_ are a FunctionalStorageImpl containing an Alias, which contains a Tensor `base_`.
  // In this case (where a and b are FunctionalTensorWrappers), base_ should point not to a, but to a's unwrapped value, `a.value_`.
  // See Note [Functionalization: Alias Removal] for a diagram that shows this visually.
  at::Tensor base_;
  std::vector<Update> updates_;
  // generation_ gets incremented every time a mutation is queued onto the alias.
  // It is used to determine if a given tensor is "up to date", or if it needs to be regenerated from the alias.
  size_t generation_ = 0;
};
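
Putting the two pieces together, here is a sketch of the lazy-update contract (again hypothetical, reusing the make_transpose_meta() helper sketched above): add_update() only queues work and bumps the generation; nothing touches base_ until apply_updates() runs.

#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

void alias_update_sketch() {
  at::Tensor base = at::zeros({2, 3});
  at::functionalization::Alias alias(base);

  // Queue a mutation that was made through a transposed view of the base.
  std::vector<at::functionalization::ViewMeta> metas = {make_transpose_meta()};
  at::Tensor mutated_view = base.transpose(0, 1) + 1;
  alias.add_update(mutated_view, metas);  // bumps alias.generation(), but applies nothing yet

  // A wrapper whose generation_ is stale would now call apply_updates()
  // (and then replay its own views) to catch up.
  alias.apply_updates();
  // alias.base() now reflects the mutation that was made through the view.
}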

// FunctionalStorageImpl is a subclass of StorageImpl used by the functionalization pass.
// It has no underlying data (similar to meta storage).
// It also knows how to reflect mutations to tensors in the absence of a valid data pointer.
// It does this by separately storing an Alias object, which knows how to reflect mutations
// that may have happened to views of the original tensor.
struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
  explicit FunctionalStorageImpl(const Tensor& value);

  void add_update(const Tensor& updated_val, const std::vector<ViewMeta>& view_metas);
  void apply_updates();
  const Tensor& base();
  size_t generation() const;

  ~FunctionalStorageImpl() override = default;

 private:
  at::functionalization::Alias alias_;
};

} // namespace functionalization
} // namespace at
377  aten/src/ATen/FunctionalTensorWrapper.cpp  Normal file
@@ -0,0 +1,377 @@

#include <ATen/FunctionalTensorWrapper.h>

#include <ATen/FunctionalInverses.h>
#include <ATen/TensorUtils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <c10/util/Exception.h>

#include <c10/util/irange.h>

namespace at {

void FunctionalTensorWrapper::set_constructor_metadata() {
  TORCH_INTERNAL_ASSERT(value_.defined());
  // Note: "level" is a concept that we don't know how to compute in core.
  // For now I'm retroactively setting this in functorch,
  // but once Open Multiple Dispatch lands we should be able to calculate this in core.
  level_ = -1;
  // shallow_copy_from overwrites the storage and dispatch keyset...
  auto functional_storage = storage_;
  shallow_copy_from(value_.getIntrusivePtr());
  storage_ = functional_storage;
  storage_access_should_throw_ = false;
  key_set_ = c10::DispatchKeySet(c10::DispatchKey::Functionalize) | value_.key_set();
}

FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value)
  : c10::TensorImpl(
      c10::Storage(c10::make_intrusive<functionalization::FunctionalStorageImpl>(value)),
      c10::DispatchKeySet(DispatchKey::Functionalize) | value.key_set(),
      value.dtype()
    ),
    value_(value)
{
  set_constructor_metadata();
}

// Note [Functionalization: Alias Removal]
// When someone calls a view() op during the functionalization pass, e.g. 'b = a.view(...)',
// we link `b` and `a` to a shared Alias object to preserve the aliasing relationship.
//
// How do we do that?
//
// Every FunctionalTensorWrapper contains a dummy FunctionalStorageImpl, which subclasses from c10::StorageImpl.
// It doesn't contain any data (similar to MetaTensor storage), but it contains an Alias object that knows about the base tensor.
// When a tensor is created through a view operation, both the new and old tensor point to the same FunctionalStorageImpl.
//
// As mutations are applied to any of the views, we also queue each mutation up on the Alias object, so we can replay them.
// When the user requests a tensor that's had a view taken, we check if it's up to date.
// If it's not up to date, we first replay all of the queued up mutations onto the alias, and then re-apply the current view
// on top of the newly updated alias.
//
// Why do we queue up and lazily run mutations on the alias, instead of updating the alias eagerly?
// This behavior was taken from pytorch/xla, which the alias-removal logic was inspired by.
// One benefit of the laziness is that we save work in the cases where a user has multiple views and mutates one of them,
// but never uses the other views later in the program (in which case we'll never update the alias).
// It also has downsides though: repeatedly applying mutations to the same view without syncing
// will silently use up more and more memory as more mutations are queued up.
//
// Corresponding diagram:
//
// b = a.view(...)
//
//        a                                                      b
//        |                                                      |
//        |     If the user asks for b and it's out of date,     |
//       \/     we regenerate b by replaying its views          \/
//              from the alias.
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - .
//  |  FunctionalTensorWrapper  |              |  FunctionalTensorWrapper  |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - .
//  |     value   |   storage   |              |   storage   |    value    |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - .
//         |               \                          /               |
//         |                 \                      /                 |
//         |                  . - - - - - - - - - - - - .             |
//         |                  |  FunctionalStorageImpl  |             |
//         |                  . - - - - - - - - - - - - .             |
//         |                  |          Alias          |             |
//         |                  . - - - - - - - - - - - - .             |
//         |                 /   mutations to a or b                  |
//         |               /     are queued onto the Alias            |
//         |             /                                            |
//        \/           /                                             \/
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//  |         TensorImpl        |              |           TensorImpl          |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//  |     value   |   storage   |              |   storage   |     value       |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//         |                                          |
//         |                                          |
//         |   In this picture the two tensor views   |
//         |   have their own storages, but backends  |
//         |   like functorch are allowed to re-alias |
//        \/   underneath the pass.                  \/
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//  |     underlying_storage    |              |       underlying_storage      |
//  . - - - - - - - - - - - - - .              . - - - - - - - - - - - - - - - .
//
// This constructor is only used by view ops.
// - view_value: The output tensor that we need to wrap.
// - base: The "base" of the view that `view_value` was generated from.
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta)
  : c10::TensorImpl(
      c10::DispatchKeySet(DispatchKey::Functionalize),
      view_value.dtype(),
      view_value.device()
    ),
    value_(view_value)
{
  set_constructor_metadata();
  // Copy the original tensor's ViewMeta vector and push the current one.
  if (base->view_metas_.size() > 0) {
    view_metas_ = base->view_metas_;  // copy
  }
  view_metas_.push_back(meta);
  storage_ = base->storage_;  // alias this tensor's storage with the base tensor's
}
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
  return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
}

void FunctionalTensorWrapper::commit_update() {
  auto storage_impl = functional_storage_impl();
  storage_impl->add_update(value_, view_metas_);
  // Invariant: commit_update() is called during an inplace operation.
  // Tensor inputs to the operation are synced before running the op,
  // so the current tensor must be up-to-date with its alias at this point.
  generation_ = storage_impl->generation();
}

bool FunctionalTensorWrapper::is_aliased() const {
  // Two FunctionalTensorWrapper objects are aliased if they share storage.
  // That means that we can check if a given FunctionalTensorWrapper is aliased
  // by checking the reference count on its storage.
  return storage_.use_count() > 1;
}

bool FunctionalTensorWrapper::is_up_to_date() const {
  if (!is_aliased()) return true;
  auto alias_generation = functional_storage_impl()->generation();
  return generation_ == alias_generation;
}

// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(at::functionalization::ViewMeta meta) {
  view_metas_.push_back(meta);
  // Note [Functionalization Pass - Inplace View Ops]
  // So, these ops are special - they're mutation AND view ops. They get special codegen.
  // An example is transpose_, e.g. `a.transpose_()`
  // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
  // We also need to force a sync (even if a is already up to date), because a's underlying tensor hasn't actually
  // been updated to reflect the new view yet.
  regenerate_from_base();
}

// Note [Functionalization: Mutation Removal]
// Mutation removal is used to take a program like this:
//
// a.add_(b)
//
// and replace it with a slightly different program that has the same semantics:
//
// tmp = a.add(b)
// a.replace_(tmp)
//
// Where the replace_() call is implemented directly in the functionalization pass, so it is transparent to the backend.
// This is useful for backends that aren't able to handle certain types of mutations, like functorch.
//
// Why do we need to wrap every tensor in a FunctionalTensorWrapper? Consider this program:
//
// Before:
// tensor.add_(batched_tensor)
//
// After:
// tmp = tensor.add(batched_tensor)
// tensor.replace_(tmp)
//
// In the above, tmp is a batched tensor (because adding a normal tensor to a batched tensor does broadcasting and creates a batched tensor).
// But we can't just replace the underlying memory backing `tensor` with `tmp` - a batched tensor takes up more space!
// Instead, every input, intermediate and output of the program is wrapped in a FunctionalTensorWrapper, which wraps the underlying tensor.
void FunctionalTensorWrapper::replace_(const Tensor& other) {
  // TODO: going to need to change this if we want nested functionalize() transforms.
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(other));
  value_ = other;
}
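
The codegen'd functionalization kernels themselves are not shown in this diff, but the shape of the rewrite can be sketched by hand. The helper below is illustrative only (the name and exact call sequence are assumptions, not the generated code); it assumes both inputs are already functional-wrapped:

#include <ATen/ATen.h>
#include <ATen/FunctionalTensorWrapper.h>

at::Tensor& functionalized_add__sketch(at::Tensor& self, const at::Tensor& other) {
  auto* self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
  at::Tensor tmp;
  {
    // Run the out-of-place op on the unwrapped tensors, below the Functionalize key.
    at::AutoDispatchSkipFunctionalize guard;
    tmp = self_impl->value().add(at::functionalization::impl::from_functional_tensor(other));
  }
  self_impl->replace_(tmp);    // swap in the new value: a.replace_(tmp)
  self_impl->commit_update();  // queue the mutation onto the shared alias
  return self;
}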

void FunctionalTensorWrapper::sync_() {
  if (is_up_to_date()) {
    return;
  }
  apply_updates();
  regenerate_from_base();
}

void FunctionalTensorWrapper::regenerate_from_base() {
  at::AutoDispatchSkipFunctionalize guard;
  auto storage_impl = functional_storage_impl();
  auto t = storage_impl->base();
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  // Reapply views to get the viewed tensor from the base in alias_
  for (auto& view_meta: view_metas_) {
    t = view_meta.forward_fn(t, view_meta.out_index);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  replace_(t);
  generation_ = storage_impl->generation();
}

void FunctionalTensorWrapper::apply_updates() {
  // Apply all updates on alias_
  auto storage_impl = functional_storage_impl();
  storage_impl->apply_updates();
}

const char* FunctionalTensorWrapper::tensorimpl_type_name() const {
  return "FunctionalTensorWrapper";
}

namespace functionalization {
namespace impl {

Tensor to_functional_tensor(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isFunctionalTensor(tensor));
  return at::detail::make_tensor<FunctionalTensorWrapper>(tensor);
}
TensorList to_functional_tensor(const c10::List<Tensor>& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs[i] = to_functional_tensor(t_list[i]);
  }
  return outputs;
}
std::vector<Tensor> to_functional_tensor(const std::vector<Tensor>& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs[i] = to_functional_tensor(t_list[i]);
  }
  return outputs;
}
TensorList to_functional_tensor(const TensorList& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs[i] = to_functional_tensor(t_list[i]);
  }
  return outputs;
}

Tensor from_functional_tensor(const Tensor& tensor) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(tensor));
  auto impl = unsafeGetFunctionalWrapper(tensor);
  return impl->value();
}
c10::optional<Tensor> from_functional_tensor(const c10::optional<Tensor>& t) {
  if (t.has_value()) {
    return c10::make_optional<Tensor>(from_functional_tensor(*t));
  }
  return c10::nullopt;
}
c10::List<Tensor> from_functional_tensor(const c10::List<Tensor>& t_list) {
  c10::List<Tensor> outputs;
  outputs.reserve(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs.push_back(from_functional_tensor(t_list[i]));
  }
  return outputs;
}
c10::List<c10::optional<Tensor>> from_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list) {
  c10::List<c10::optional<Tensor>> outputs;
  outputs.reserve(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs.push_back(from_functional_tensor(t_list[i]));
  }
  return outputs;
}
TensorList from_functional_tensor(const TensorList& t_list) {
  std::vector<Tensor> outputs(t_list.size());
  for (const auto i : c10::irange(t_list.size())) {
    outputs.push_back(from_functional_tensor(t_list[i]));
  }
  return outputs;
}

void sync(const Tensor& t) {
  if (t.unsafeGetTensorImpl()->is_wrapped_number()) {
    // Unfortunately, we can't easily guarantee that wrapped numbers (scalar-tensors)
    // get wrapped up in a FunctionalTensorWrapper object, since they skip the dispatcher.
    // That shouldn't matter, since I don't think we're allowed to assign to wrapped numbers anyway.
    return;
  }
  // Not every tensor that hits a functionalization kernel is necessarily a functional tensor.
  // For example, xla_tensor.copy_(cpu_tensor) needs to hit the functionalization kernel
  // to sync xla_tensor, but not cpu_tensor.
  if (!at::functionalization::impl::isFunctionalTensor(t)) {
    return;
  }
  auto functional_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t);
  functional_impl->sync_();
}
void sync(const c10::optional<Tensor>& t) {
  if (t.has_value()) {
    sync(*t);
  }
}
void sync(const c10::List<Tensor> t_list) {
  for (const auto i : c10::irange(t_list.size())) {
    sync(t_list[i]);
  }
}
void sync(const at::TensorList t_list) {
  for (auto t: t_list) {
    sync(t);
  }
}
void sync(const c10::List<c10::optional<Tensor>> t_list) {
  for (const auto i : c10::irange(t_list.size())) {
    sync(t_list[i]);
  }
}

Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
  auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
  if (out_idx != 0) {
    // Note [out_idx in ViewMeta]
    // When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
    // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
    meta = meta.to_out_idx(out_idx);
  }
  return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
}

std::vector<Tensor> create_functional_tensor_with_view_meta(const c10::List<at::Tensor>& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) {
  std::vector<Tensor> outputs(view_to_wrap.size());
  for (const auto i : c10::irange(view_to_wrap.size())) {
    outputs[i] = create_functional_tensor_with_view_meta(view_to_wrap[i], base, meta, i);
  }
  return outputs;
}

std::vector<Tensor> create_functional_tensor_with_view_meta(const std::vector<at::Tensor>& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) {
  std::vector<Tensor> outputs(view_to_wrap.size());
  for (const auto i : c10::irange(view_to_wrap.size())) {
    outputs[i] = create_functional_tensor_with_view_meta(view_to_wrap[i], base, meta, i);
  }
  return outputs;
}

void mutate_view_meta(const at::Tensor& self, functionalization::ViewMeta meta) {
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
  auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
  self_impl->mutate_view_meta(meta);
}

// Note [Propagating strides in the functionalization pass]
// In order to properly compute stride information, the functionalization pass
// calls each {view} reference implementation with meta tensors.
// The output meta tensor's stride info serves as a reference for what the correct strides should be.
void set_sizes_strides_offset(const Tensor& out, const Tensor& reference_out) {
  out.unsafeGetTensorImpl()->set_sizes_and_strides(reference_out.sizes(), reference_out.strides());
  out.unsafeGetTensorImpl()->set_storage_offset(reference_out.storage_offset());
}

void set_sizes_strides_offset(const std::vector<Tensor>& outs, const std::vector<Tensor>& reference_outs) {
  TORCH_INTERNAL_ASSERT(outs.size() == reference_outs.size());
  for (const auto i : c10::irange(reference_outs.size())) {
    set_sizes_strides_offset(outs[i], reference_outs[i]);
  }
}
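
A small sketch of what set_sizes_strides_offset() accomplishes, using an eager transpose as the reference instead of a meta-tensor replay (illustrative only):

#include <ATen/ATen.h>
#include <ATen/FunctionalTensorWrapper.h>

void stride_propagation_sketch() {
  at::Tensor base = at::zeros({2, 3});
  at::Tensor reference_out = base.transpose(0, 1);  // sizes {3, 2}, strides {1, 3}
  at::Tensor out = at::zeros({3, 2});               // contiguous: strides {2, 1}
  at::functionalization::impl::set_sizes_strides_offset(out, reference_out);
  // out now reports the reference geometry instead of its contiguous strides.
  TORCH_CHECK(out.strides() == reference_out.strides());
}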

} // namespace impl
} // namespace functionalization
} // namespace at

155  aten/src/ATen/FunctionalTensorWrapper.h  Normal file
@@ -0,0 +1,155 @@

#pragma once

#include <ATen/ArrayRef.h>
#include <ATen/core/List.h>
#include <ATen/FunctionalStorageImpl.h>

#include <c10/core/DispatchKey.h>

namespace at {

// Note [Functionalization Pass In Core]
// The Functionalization pass is used to remove aliasing from a pytorch program.
//
// This is useful for backends that don't support aliasing, like XLA and Vulkan.
// It's also necessary in order to remove mutation from a program, which is needed in Functorch.
//
// Consider this program:
// a = torch.ones(...)
// b = a.view(...)
// b.add_(1)
//
// In this program, b is meant to alias with a due to the use of view(). At the end of the program, both a and b are full of 2's.
// However, backends that don't support aliasing aren't able to correctly implement the view() operator.
// Instead, they can opt into the Functionalization pass, which will sit between the user and the backend,
// and provide the necessary aliasing logic.
//
// The functionalization pass will turn the above program into a slightly different program that has the same semantics,
// transparently to the user, that backends like XLA/Vulkan are able to implement:
// a = torch.ones(...)
// b = a.view_copy(...)  # view() replaced with view_copy(). Backends like XLA/Vulkan can implement this!
// b.add_(1)
// a.add_(1)  # Our functionalization pass machinery knows that a and b are aliased - it applies b's mutation to a too.
//
// So, how does the functionalization pass keep track of which tensors are aliased?
// The pass works by wrapping EVERY tensor in the program inside of a FunctionalTensorWrapper, which knows about its alias'd tensors.
//
// See Note [Functionalization: Alias Removal] for details on the aliasing machinery.
// See Note [Functionalization: Mutation Removal] for details on mutation removal.
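
The claimed equivalence is easy to check on eager tensors. The sketch below (not part of the diff) uses clone() as a stand-in for view_copy() and verifies that both programs end in the same state:

#include <ATen/ATen.h>

void functionalization_equivalence_sketch() {
  // Original program: b aliases a.
  at::Tensor a1 = at::ones({4});
  at::Tensor b1 = a1.view({2, 2});
  b1.add_(1);

  // Functionalized form: the view becomes a copy, and the mutation is replayed onto a.
  at::Tensor a2 = at::ones({4});
  at::Tensor b2 = a2.view({2, 2}).clone();  // stand-in for view_copy()
  b2.add_(1);
  a2.add_(1);

  TORCH_CHECK(at::allclose(a1, a2) && at::allclose(b1, b2));
}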

struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  explicit FunctionalTensorWrapper(const Tensor& value);
  // Additional constructor to create a FunctionalTensorWrapper directly from an underlying tensor that was created from a view.
  // For example, the code b = a.view1() will generate a constructor call to FunctionalTensorWrapper(b, a, view1_meta)
  explicit FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta);

  // Get the underlying, actual tensor, that doesn't know anything about functionalization.
  const Tensor& value() const { return value_; };
  // The concept of "level" is only ever important to functorch; it's exposed here
  // as more of a hook for functorch to use.
  int64_t level() const { return level_; };
  void set_level(int64_t level) { level_ = level; }

  // Syncs the underlying tensor with its alias, if it's out of date. This involves two steps:
  // 1) Apply any pending updates/mutations to the alias
  // 2) Replay the views (if any) to regenerate the current tensor off of the updated alias.
  void sync_();
  // Performs step (1) of the sync. This is its own public API because it's needed by view_inplace ops like transpose_.
  // See Note [Functionalization Pass - Inplace View Ops]
  void regenerate_from_base();
  // Performs step (2) of the sync. This is its own public API because it's needed by functorch.
  // functorch wants to make sure that all input tensors to a functionalized program have been properly synced
  // so it can properly propagate mutations to inputs.
  // It can't just call sync_(), because the FunctionalTensorWrapper will look like it has no aliases and sync_ will be a no-op.
  // We use the reference count on storage_ to determine if the wrapper is aliased, and by the time functorch
  // is ready to propagate updates to inputs, any intermediate views of the input created by the program will have been deallocated.
  void apply_updates();
  // Takes the current state of value_ and snapshots it, sending it as a pending update to the alias.
  void commit_update();
  // When any tensor is mutated, the tensor increments its alias's "generation".
  // Separately, each tensor maintains its own "generation" counter, which is used to determine if it's up-to-date with its alias.
  // The act of syncing a tensor will set the tensor's generation equal to its alias's generation.
  bool is_up_to_date() const;
  // Every FunctionalTensorWrapper contains a vector of ViewMeta objects describing the series of view ops that ran
  // to generate the current tensor from the base tensor.
  // This method is used by inplace-view ops like transpose_.
  // It appends a ViewMeta to the existing stack, and refreshes the tensor by replaying the views off of the alias.
  void mutate_view_meta(at::functionalization::ViewMeta meta);

  // The functionalization pass can be used to remove mutations.
  // It does so by replacing any mutation op with its corresponding out-of-place op, followed by a call to replace_().
  // e.g.:
  //
  // a.add_(1)
  //
  // will turn into:
  //
  // tmp = a.add(1)
  // a.replace_(tmp)
  //
  // replace_() swaps out the wrapped tensor, value_, with tmp.
  void replace_(const Tensor& other);

  ~FunctionalTensorWrapper() override = default;

 private:
  const char* tensorimpl_type_name() const override;
  // Returns true if this FunctionalTensorWrapper is aliased with any other FunctionalTensorWrapper objects.
  // During a functionalization pass, if we have `b = a.view()`, then a and b should both report as aliased.
  bool is_aliased() const;
  void set_constructor_metadata();
  functionalization::FunctionalStorageImpl* functional_storage_impl() const;

  // Note that value is not taken by reference: internally, the wrapper will change the value tensor that it points to over time.
  Tensor value_;
  int64_t level_;

  size_t generation_ = 0;
  std::vector<at::functionalization::ViewMeta> view_metas_;
};

// Utility functions for the functionalization pass.

namespace functionalization {
namespace impl {

TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(const Tensor& tensor) {
  auto functional_impl = static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_impl != nullptr);
  return functional_impl;
}

TORCH_API inline bool isFunctionalTensor(const at::Tensor& tensor) {
  return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize);
}
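
A quick sketch of how the helpers declared here fit together (illustrative; the function name is made up):

#include <ATen/ATen.h>
#include <ATen/FunctionalTensorWrapper.h>

void wrapper_roundtrip_sketch() {
  at::Tensor t = at::ones({2, 2});
  at::Tensor f = at::functionalization::impl::to_functional_tensor(t);

  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(f));
  auto* wrapper = at::functionalization::impl::unsafeGetFunctionalWrapper(f);
  at::Tensor inner = wrapper->value();  // the plain tensor the wrapper currently points at
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(inner));

  at::Tensor unwrapped = at::functionalization::impl::from_functional_tensor(f);
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(unwrapped));
}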

TORCH_API Tensor to_functional_tensor(const Tensor& tensor);
TORCH_API TensorList to_functional_tensor(const c10::List<Tensor>& t_list);
TORCH_API std::vector<Tensor> to_functional_tensor(const std::vector<Tensor>& t_list);
TORCH_API TensorList to_functional_tensor(const TensorList& t_list);

TORCH_API Tensor from_functional_tensor(const Tensor& tensor);
TORCH_API c10::optional<Tensor> from_functional_tensor(const c10::optional<Tensor>& t);
TORCH_API c10::List<Tensor> from_functional_tensor(const c10::List<Tensor>& t_list);
TORCH_API c10::List<c10::optional<Tensor>> from_functional_tensor(const c10::List<c10::optional<Tensor>>& t_list);
TORCH_API TensorList from_functional_tensor(const TensorList& tensors);

TORCH_API void sync(const at::Tensor& t);
TORCH_API void sync(const c10::optional<Tensor>& t);
TORCH_API void sync(const c10::List<Tensor> t_list);
TORCH_API void sync(const at::TensorList t_list);
TORCH_API void sync(const c10::List<c10::optional<Tensor>> t_list);

Tensor create_functional_tensor_with_view_meta(const Tensor& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta, int64_t out_idx = 0);
std::vector<Tensor> create_functional_tensor_with_view_meta(const c10::List<Tensor>& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta);
std::vector<Tensor> create_functional_tensor_with_view_meta(const std::vector<Tensor>& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta);

void mutate_view_meta(const Tensor& self, functionalization::ViewMeta meta);

void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
void set_sizes_strides_offset(const std::vector<Tensor>& outs, const std::vector<Tensor>& meta_outs);

} // namespace impl
} // namespace functionalization
} // namespace at

@@ -84,6 +84,13 @@ struct TORCH_API AutoNonVariableTypeMode {
  c10::impl::ExcludeDispatchKeyGuard autograd_guard_;
};

struct TORCH_API AutoDispatchSkipFunctionalize {
  AutoDispatchSkipFunctionalize() :
    dispatch_key_guard_(c10::DispatchKeySet(c10::DispatchKey::Functionalize)) {
  }
  c10::impl::ExcludeDispatchKeyGuard dispatch_key_guard_;
};
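
Usage is the usual RAII pattern; a hedged sketch (the surrounding function and its tensor arguments are placeholders, not code from this PR):

#include <ATen/ATen.h>
#include <ATen/core/LegacyTypeDispatch.h>

void skip_functionalize_sketch(const at::Tensor& unwrapped_self, const at::Tensor& unwrapped_other) {
  // While the guard is alive, the Functionalize key is excluded from TLS,
  // so these calls dispatch straight to the kernels underneath the pass.
  at::AutoDispatchSkipFunctionalize guard;
  at::Tensor tmp = unwrapped_self.add(unwrapped_other);
  (void)tmp;
}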

/* Note [AutoDispatchBelowADInplaceOrView]
 * AutoDispatchBelowADInplaceOrView is equivalent to AutoNonVariableTypeMode
 * before we split inplace & view ops out of VariableType kernel.

@@ -1,6 +1,8 @@
#include <ATen/core/Tensor.h>
#include <ATen/core/Formatting.h>
#include <ATen/core/VariableHooksInterface.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>

#include <iostream>

@@ -113,7 +115,7 @@ const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const {
  return *this;
}

// View Variables
// View Methods
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

bool TensorBase::is_view() const {

@@ -1,5 +1,6 @@
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <torch/library.h>

/*

@@ -60,6 +61,55 @@ TORCH_LIBRARY_IMPL(_, AutogradMLC, m) {
  m.fallback(torch::CppFunction::makeFallthrough());
}

namespace {
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) {
  const auto& schema = op.schema();
  TORCH_INTERNAL_ASSERT(!schema.hasAnyAliasInfo(), "mutating and aliasing ops should all have codegen'd kernels");
  const auto num_arguments = schema.arguments().size();
  const auto arguments_begin = stack->size() - num_arguments;
  auto arguments = torch::jit::last(stack, num_arguments);

  for (uint64_t idx = 0; idx < num_arguments; ++idx) {
    const auto& ivalue = arguments[idx];
    if (ivalue.isTensor()) {
      at::Tensor t = ivalue.toTensor();
      at::functionalization::impl::sync(t);
      auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
      (*stack)[arguments_begin + idx] = t_new;
    } else if (ivalue.isTensorList()) {
      auto tensors = ivalue.toTensorList();
      at::functionalization::impl::sync(tensors);
      auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(tensors));
      (*stack)[arguments_begin + idx] = t_new;
    }
  }
  {
    at::AutoDispatchSkipFunctionalize guard;
    op.redispatchBoxed(dispatchKeySet & c10::after_func_keyset, stack);
  }
  const auto num_returns = schema.returns().size();
  const auto returns_begin = stack->size() - num_returns;
  auto returns = torch::jit::last(stack, num_returns);

  for (uint64_t idx = 0; idx < num_returns; ++idx) {
    const auto& ivalue = returns[idx];
    if (ivalue.isTensor()) {
      at::Tensor t = ivalue.toTensor();
      auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
      (*stack)[returns_begin + idx] = t_new;
    } else if (ivalue.isTensorList()) {
      auto tensors = ivalue.toTensorList();
      auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(tensors));
      (*stack)[returns_begin + idx] = t_new;
    }
  }
}
}

TORCH_LIBRARY_IMPL(_, Functionalize, m) {
  m.fallback(torch::CppFunction::makeFromBoxedFunction<&functionalizeFallback>());
}

// see Note [ADInplaceOrView key]
TORCH_LIBRARY_IMPL(_, ADInplaceOrView, m) {
  m.fallback(torch::CppFunction::makeFallthrough());
@@ -197,6 +197,8 @@ _(aten, atan2) \
_(aten, atleast_1d) \
_(aten, atleast_2d) \
_(aten, atleast_3d) \
_(aten, _autocast_to_reduced_precision) \
_(aten, _autocast_to_full_precision) \
_(aten, avg_pool1d) \
_(aten, avg_pool2d) \
_(aten, avg_pool2d_backward) \

@@ -23,22 +23,14 @@ struct BuiltinOpFunction : public Function {
    TORCH_INTERNAL_ASSERT(schema_.returns().size() == 1);
  }

  const std::string& doc_string() const override {
  c10::string_view doc_string() const override {
    return doc_string_;
  }

  bool isGraphFunction() const override {
    return false;
  }

  void run(Stack& stack) override {
    callable_(stack);
  }

  void run(Stack&& stack) override {
    callable_(stack);
  }

  c10::intrusive_ptr<c10::ivalue::Future> runAsync(
      Stack& stack,
      TaskLauncher /* not used */) override {

@@ -48,50 +40,15 @@ struct BuiltinOpFunction : public Function {
    return res;
  }

  at::IValue operator()(std::vector<at::IValue> stack, const Kwargs& kwargs)
      override {
    getSchema().checkAndNormalizeInputs(stack, kwargs);
    callable_(stack);
    return stack.front();
  }

  const c10::QualifiedName& qualname() const override {
    return name_;
  }

  const std::string& name() const override {
    return name_.name();
  }

  // if this isn't yet defined, run its method_creator function
  void ensure_defined() override {
    // nop
  }

  std::shared_ptr<Graph> graph() const override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a graph requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  std::shared_ptr<Graph> optimized_graph() const override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a graph requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  void clear_execution_info() override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a graph requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  GraphExecutor& get_executor() override {
    TORCH_INTERNAL_ASSERT(false, "BuiltinFunction had a GraphExecutor requested "
        "from it. This probably indicates that the JIT calling context needs a "
        "special case on Function::isGraphFunction()");
  }

  const c10::FunctionSchema& getSchema() const override {
    return schema_;
  }

@@ -100,29 +57,21 @@ struct BuiltinOpFunction : public Function {
    return schema_.arguments().size();
  }

  void check_single_output() override {
    TORCH_CHECK(schema_.returns().size() == 1);
  }

  std::string pretty_print_schema() const override {
#ifdef __NVCC__
    // Disable the "statement is unreachable" warning
#pragma diag_suppress code_is_unreachable
#endif

    TORCH_INTERNAL_ASSERT(false);
    return "";

#ifdef __NVCC__
#pragma diag_default code_is_unreachable
#endif
  }

  Function& setSchema(c10::FunctionSchema schema) override {
    schema_ = std::move(schema);
    return *this;
  }

  bool call(Stack& stack, size_t, c10::function_ref<void(const Code&)>) override {
    run(stack);
    return false;
  }

  bool call(Stack& stack, c10::function_ref<void(const mobile::Code&)>) override {
    run(stack);
    return false;
  }

  ~BuiltinOpFunction() override {}

 private:
@@ -1,8 +1,10 @@
#pragma once

#include <ATen/core/function_schema.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/qualified_name.h>
#include <mutex>
#include <c10/util/Exception.h>
#include <c10/util/FunctionRef.h>

namespace c10 {
struct FunctionSchema;

@@ -16,7 +18,11 @@ namespace torch {
namespace jit {

struct Graph;
struct GraphExecutor;
struct Code;

namespace mobile {
struct Code;
}

using Stack = std::vector<at::IValue>;
using Kwargs = std::unordered_map<std::string, at::IValue>;

@@ -30,50 +36,70 @@ TORCH_API void preoptimizeGraph(std::shared_ptr<Graph>& graph);
// execution of the function. Method is a wrapper around an
// underlying Function that also provides a `self` object.
struct TORCH_API Function {
  virtual const std::string& doc_string() const {
    static const std::string no_doc_string = "";
  virtual c10::string_view doc_string() const {
    static constexpr c10::string_view no_doc_string = "";
    return no_doc_string;
  }

  virtual bool isGraphFunction() const = 0;
  virtual bool isGraphFunction() const {
    return false;
  }

  virtual void run(Stack& stack) = 0;

  virtual void run(Stack&& stack) = 0;

  virtual c10::intrusive_ptr<c10::ivalue::Future> runAsync(
      Stack& stack,
      TaskLauncher taskLauncher = at::launch) = 0;
      TaskLauncher taskLauncher = at::launch) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
    return {};
  }

  virtual at::IValue operator()(
      std::vector<at::IValue> stack,
      const Kwargs& kwargs = Kwargs()) = 0;
  at::IValue operator()(
      Stack stack,
      const Kwargs& kwargs = Kwargs()) {
    getSchema().checkAndNormalizeInputs(stack, kwargs);
    run(stack);
    return stack.front();
  }

  virtual const c10::QualifiedName& qualname() const = 0;

  virtual const std::string& name() const = 0;
  const std::string& name() const {
    return qualname().name();
  }

  // if this isn't yet defined, run its method_creator function
  virtual void ensure_defined() = 0;

  virtual std::shared_ptr<Graph> graph() const = 0;

  virtual std::shared_ptr<Graph> optimized_graph() const = 0;

  virtual void clear_execution_info() = 0;

  virtual GraphExecutor& get_executor() = 0;

  virtual const c10::FunctionSchema& getSchema() const = 0;

  virtual size_t num_inputs() const = 0;

  virtual void check_single_output() = 0;

  virtual std::string pretty_print_schema() const = 0;

  virtual Function& setSchema(c10::FunctionSchema schema) = 0;

  // call() defines how different interpreter implementations interact with
  // Function objects. Basically, interpreters need to provide a callback to
  // communicate to Functions what to do if provided a Code object.
  // Alternatively we could design the signature to return an optional Code
  // object, but that requires special handling of the null case in the interpreter,
  // and the fallback behavior is not well defined by the interpreter but rather
  // by the Functions themselves, so a callback approach is more reasonable than
  // returning values.
  // If call() returns true, then the callback completed successfully; otherwise
  // call() returns false.

  // Overload for server interpreter, a bailout size is needed for graph executor.
  virtual bool call(Stack&, size_t, c10::function_ref<void(const Code&)>) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
    return false;
  }

  // Overload for mobile interpreter.
  virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
    return false;
  }

  virtual ~Function() {}
};
} // namespace jit
@ -172,6 +172,10 @@ inline bool operator==(const Argument& lhs, const Argument& rhs) {
|
|||
&& *lhs.alias_info() == *rhs.alias_info()));
|
||||
}
|
||||
|
||||
inline bool operator!=(const Argument& lhs, const Argument& rhs) {
  return !(lhs == rhs);
}

bool operator==(const FunctionSchema& lhs, const FunctionSchema& rhs);

struct FunctionSchema {

@@ -463,6 +463,7 @@ namespace c10 {
  _(attr, df_output_vjps) \
  _(attr, axes) \
  _(attr, axis) \
  _(attr, symbolic_shape_inputs) \
  _(attr, broadcast) \
  _(attr, direction) \
  _(attr, ends) \

@@ -1937,14 +1937,14 @@ struct TORCH_API ClassAttribute {
      TypePtr attributeType,
      std::string attributeName) :
    kind_(kind),
    attributeType_(attributeType),
    attributeType_(std::move(attributeType)),
    attributeName_(std::move(attributeName)) {}

  AttributeKind getKind() const {
    return kind_;
  }

  TypePtr getType() const {
  const TypePtr& getType() const {
    return attributeType_;
  }

@@ -2026,22 +2026,22 @@ struct TORCH_API ClassType : public NamedType {
    return attributes_[pos].getType();
  }

  TypePtr getAttribute(const std::string& name) const {
    auto type = findAttribute(name);
  const TypePtr& getAttribute(const std::string& name) const {
    auto slot = findAttributeSlot(name);
    TORCH_CHECK(
        type,
        slot,
        repr_str(),
        " does not have an attribute with name '",
        name,
        "'");
    return type;
    return attributes_[*slot].getType();
  }

  size_t numAttributes() const {
    return attributes_.size();
  }

  TypePtr getAttribute(size_t slot) const {
  const TypePtr& getAttribute(size_t slot) const {
    AT_ASSERT(slot < attributes_.size());
    return attributes_.at(slot).getType();
  }

@@ -2059,7 +2059,7 @@ struct TORCH_API ClassType : public NamedType {
  c10::optional<size_t> findAttributeSlot(const std::string& name) const {
    size_t slot = 0;
    for (const auto& attr : attributes_) {
      if (name.compare(attr.getName()) == 0) {
      if (name == attr.getName()) {
        return slot;
      }
      slot++;

@@ -2094,7 +2094,7 @@ struct TORCH_API ClassType : public NamedType {

  size_t addAttribute(
      const std::string& name,
      const TypePtr& type,
      TypePtr type,
      bool is_parameter = false,
      bool is_buffer = false);

@@ -2122,7 +2122,7 @@ struct TORCH_API ClassType : public NamedType {
      bool is_buffer = false) {
    auto slot_idx = findAttributeSlot(name);
    if (!slot_idx) {
      return addAttribute(name, ty, is_parameter, is_buffer);
      return addAttribute(name, std::move(ty), is_parameter, is_buffer);
    }

    TORCH_CHECK(

@@ -2130,7 +2130,7 @@ struct TORCH_API ClassType : public NamedType {
        "Parameter field mismatch for the field '",
        name,
        "'");
    TypePtr atype = getAttribute(*slot_idx);
    const TypePtr& atype = getAttribute(*slot_idx);
    TORCH_CHECK(
        ty->isSubtypeOf(*atype),
        ty->repr_str(),

@@ -2227,7 +2227,7 @@ struct TORCH_API ClassType : public NamedType {
    AT_ASSERT(numAttributes() == contained_types.size());
    for(size_t i = 0; i < attributes_.size(); ++i) {
      AT_ASSERT(attributes_[i].getType()->isSubtypeOf(*contained_types[i]));
      ptr->addAttribute(attributes_[i].getName(), contained_types[i]);
      ptr->addAttribute(attributes_[i].getName(), std::move(contained_types[i]));
    }
    // Copy methods over
    for (const auto& method : methods()) {

@@ -2228,7 +2228,7 @@ void ClassType::addAttribute(ClassAttribute classAttribute) {

size_t ClassType::addAttribute(
    const std::string& name,
    const TypePtr& type,
    TypePtr type,
    bool is_parameter,
    bool is_buffer) {
  if (is_parameter && is_buffer){

@@ -2248,16 +2248,13 @@ size_t ClassType::addAttribute(
    kind = AttributeKind::BUFFER;
  }

  ClassAttribute ClassAttribute(kind, type, name);

  addAttribute(ClassAttribute);

  if (is_parameter || is_buffer) {
    TORCH_INTERNAL_ASSERT(is_module(), "adding a parameter or buffer to a non module");
    TORCH_CHECK(
        (type->kind() == TensorType::Kind) ||
        (type->kind() == OptionalType::Kind &&
         type->expect<OptionalType>()->getElementType()->kind() ==
         type->expectRef<OptionalType>().getElementType()->kind() ==
             TensorType::Kind) ||
        (type->kind() == UnionType::Kind &&
         TensorType::get()->isSubtypeOf(type->expectRef<UnionType>())) ||

@@ -2266,6 +2263,8 @@ size_t ClassType::addAttribute(
        toString(type));
  }

  addAttribute(ClassAttribute(kind, std::move(type), name));

  return slot;
}
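The recurring pattern in the hunks above is switching attribute setters from `const TypePtr&` to by-value `TypePtr` plus `std::move`, and returning `const TypePtr&` from getters. A minimal standalone sketch of that idea (not PyTorch code; `TypePtr` is only a `std::shared_ptr` stand-in here):

#include <memory>
#include <string>
#include <utility>

using TypePtr = std::shared_ptr<std::string>;  // stand-in for c10::TypePtr

struct Attribute {
  // Taking the shared_ptr by value lets callers move into it, so at most
  // one refcount bump happens along the whole call chain.
  Attribute(TypePtr type, std::string name)
      : type_(std::move(type)), name_(std::move(name)) {}

  // Returning a const reference avoids a refcount bump on every read.
  const TypePtr& type() const { return type_; }

 private:
  TypePtr type_;
  std::string name_;
};

int main() {
  TypePtr t = std::make_shared<std::string>("Tensor");
  Attribute attr(std::move(t), "weight");  // no extra copy of the control block
  return attr.type()->empty() ? 1 : 0;
}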
@@ -1,6 +1,4 @@
#pragma once

#include <ATen/cpu/vec/functional_base.h>
#if !defined(__VSX__) || !defined(CPU_CAPABILITY_VSX)
#include <ATen/cpu/vec/functional_bfloat16.h>
#endif
54  aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h  Normal file
@@ -0,0 +1,54 @@
#pragma once

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
#include <ATen/cpu/vec/vec_base.h>
namespace at {
namespace vec {
// See Note [Acceptable use of anonymous namespace in header]
namespace {

inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
    const Vectorized<BFloat16>& a) {
  constexpr int64_t K = Vectorized<BFloat16>::size();
  __at_align__ float arr[K];
  __at_align__ BFloat16 arr2[K];
  a.store(arr2);
  convert(arr2, arr, K);
  return std::make_tuple(
      Vectorized<float>::loadu(arr),
      Vectorized<float>::loadu(arr + Vectorized<float>::size()));
}

inline Vectorized<BFloat16> convert_float_bfloat16(
    const Vectorized<float>& a,
    const Vectorized<float>& b) {
  constexpr int64_t K = Vectorized<BFloat16>::size();
  __at_align__ float arr[K];
  __at_align__ BFloat16 arr2[K];
  a.store(arr);
  b.store(arr + Vectorized<float>::size());
  convert(arr, arr2, K);
  return Vectorized<BFloat16>::loadu(arr2);
}

void load_fp32_from_bf16(const c10::BFloat16* data, Vectorized<float>& out) {
  __at_align__ float values[Vectorized<float>::size()];
  for (int k = 0; k < Vectorized<float>::size(); ++k) {
    values[k] = data[k];
  }
  out = Vectorized<float>::loadu(values);
}

C10_UNUSED void load_fp32_from_bf16(
    const c10::BFloat16* data,
    Vectorized<float>& out1,
    Vectorized<float>& out2) {
  load_fp32_from_bf16(data, out1);
  data += Vectorized<float>::size();
  load_fp32_from_bf16(data, out2);
}

} // namespace
} // namespace vec
} // namespace at
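The new VSX header widens BFloat16 through a store/convert/reload round-trip rather than dedicated vector intrinsics. The underlying scalar trick — bfloat16 is simply the high 16 bits of an IEEE float — can be sketched outside of ATen like this (truncating conversion only; a production conversion also handles round-to-nearest-even and NaN):

#include <cstdint>
#include <cstring>

// Widen one bfloat16 (stored as uint16_t) to float: place the 16 bits
// in the upper half of a 32-bit pattern and reinterpret as float.
static float bf16_to_float(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Narrow a float to bfloat16 by truncation.
static uint16_t float_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

int main() {
  uint16_t b = float_to_bf16(1.5f);           // 1.5 is exactly representable in bf16
  return bf16_to_float(b) == 1.5f ? 0 : 1;
}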
@@ -3,6 +3,8 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>

// Note: header order is important here
#include <ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h>

@@ -11,8 +13,12 @@
#include <ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h>

#include <ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h>
#include <ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h>

#include <ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h>

namespace at {
namespace vec {
@@ -10,3 +10,17 @@
#else
#define AT_USE_CUSPARSE_GENERIC_API() 0
#endif

// cuSparse Generic API spsv function was added in CUDA 11.3.0
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1
#else
#define AT_USE_CUSPARSE_GENERIC_SPSV() 0
#endif

// cuSparse Generic API spsm function was added in CUDA 11.3.1
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11600)
#define AT_USE_CUSPARSE_GENERIC_SPSM() 1
#else
#define AT_USE_CUSPARSE_GENERIC_SPSM() 0
#endif
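These function-style macros gate code on the cuSPARSE version found at compile time, so callers can branch without repeating raw version checks. A hypothetical call site (the guard name is the one defined above; `spsv_path()` and `fallback_path()` are placeholders, not real functions):

#if AT_USE_CUSPARSE_GENERIC_SPSV()
  spsv_path();      // CUDA toolkit new enough for the cusparseSpSV_* generic API
#else
  fallback_path();  // older toolkits take some other code path
#endif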
219  aten/src/ATen/cuda/CUDASparseBlas.cpp  Normal file
@@ -0,0 +1,219 @@
/*
  Provides the implementations of cuSPARSE function templates.
*/

#include <ATen/cuda/CUDASparseBlas.h>

namespace at {
namespace cuda {
namespace sparse {

template <>
void csrgeam2_bufferSizeExt<float>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float)) {
  TORCH_CUDASPARSE_CHECK(cusparseScsrgeam2_bufferSizeExt(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2_bufferSizeExt<double>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double)) {
  TORCH_CUDASPARSE_CHECK(cusparseDcsrgeam2_bufferSizeExt(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2_bufferSizeExt<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<float>)) {
  TORCH_CUDASPARSE_CHECK(cusparseCcsrgeam2_bufferSizeExt(
      handle, m, n,
      reinterpret_cast<const cuComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<const cuComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2_bufferSizeExt<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<double>)) {
  TORCH_CUDASPARSE_CHECK(cusparseZcsrgeam2_bufferSizeExt(
      handle, m, n,
      reinterpret_cast<const cuDoubleComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuDoubleComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<const cuDoubleComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBufferSizeInBytes));
}

template <>
void csrgeam2<float>(CUSPARSE_CSRGEAM2_ARGTYPES(float)) {
  TORCH_CUDASPARSE_CHECK(cusparseScsrgeam2(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

template <>
void csrgeam2<double>(CUSPARSE_CSRGEAM2_ARGTYPES(double)) {
  TORCH_CUDASPARSE_CHECK(cusparseDcsrgeam2(
      handle, m, n,
      alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
      beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

template <>
void csrgeam2<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<float>)) {
  TORCH_CUDASPARSE_CHECK(cusparseCcsrgeam2(
      handle, m, n,
      reinterpret_cast<const cuComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<cuComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

template <>
void csrgeam2<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<double>)) {
  TORCH_CUDASPARSE_CHECK(cusparseZcsrgeam2(
      handle, m, n,
      reinterpret_cast<const cuDoubleComplex*>(alpha), descrA, nnzA,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValA), csrSortedRowPtrA, csrSortedColIndA,
      reinterpret_cast<const cuDoubleComplex*>(beta), descrB, nnzB,
      reinterpret_cast<const cuDoubleComplex*>(csrSortedValB), csrSortedRowPtrB, csrSortedColIndB,
      descrC, reinterpret_cast<cuDoubleComplex*>(csrSortedValC), csrSortedRowPtrC, csrSortedColIndC,
      pBuffer));
}

} // namespace sparse
} // namespace cuda
} // namespace at
110  aten/src/ATen/cuda/CUDASparseBlas.h  Normal file
@@ -0,0 +1,110 @@
#pragma once

/*
  Provides a subset of cuSPARSE functions as templates:

    csrgeam2<scalar_t>(...)

  where scalar_t is double, float, c10::complex<double> or c10::complex<float>.
  The functions are available in at::cuda::sparse namespace.
*/

#include <ATen/cuda/CUDAContext.h>

namespace at {
namespace cuda {
namespace sparse {

#define CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)               \
  cusparseHandle_t handle, int m, int n, const scalar_t *alpha,       \
      const cusparseMatDescr_t descrA, int nnzA,                      \
      const scalar_t *csrSortedValA, const int *csrSortedRowPtrA,     \
      const int *csrSortedColIndA, const scalar_t *beta,              \
      const cusparseMatDescr_t descrB, int nnzB,                      \
      const scalar_t *csrSortedValB, const int *csrSortedRowPtrB,     \
      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,   \
      const scalar_t *csrSortedValC, const int *csrSortedRowPtrC,     \
      const int *csrSortedColIndC, size_t *pBufferSizeInBytes

template <typename scalar_t>
inline void csrgeam2_bufferSizeExt(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)) {
  TORCH_INTERNAL_ASSERT(
      false,
      "at::cuda::sparse::csrgeam2_bufferSizeExt: not implemented for ",
      typeid(scalar_t).name());
}

template <>
void csrgeam2_bufferSizeExt<float>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float));
template <>
void csrgeam2_bufferSizeExt<double>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double));
template <>
void csrgeam2_bufferSizeExt<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<float>));
template <>
void csrgeam2_bufferSizeExt<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex<double>));

#define CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()                                      \
  cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,     \
      int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,     \
      const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB, \
      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,           \
      int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace

template <typename scalar_t>
inline void csrgeam2Nnz(CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()) {
  TORCH_CUDASPARSE_CHECK(cusparseXcsrgeam2Nnz(
      handle, m, n,
      descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA,
      descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB,
      descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, workspace));
}

#define CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)                                  \
  cusparseHandle_t handle, int m, int n, const scalar_t *alpha,               \
      const cusparseMatDescr_t descrA, int nnzA,                              \
      const scalar_t *csrSortedValA, const int *csrSortedRowPtrA,             \
      const int *csrSortedColIndA, const scalar_t *beta,                      \
      const cusparseMatDescr_t descrB, int nnzB,                              \
      const scalar_t *csrSortedValB, const int *csrSortedRowPtrB,             \
      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,           \
      scalar_t *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC,  \
      void *pBuffer

template <typename scalar_t>
inline void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)) {
  TORCH_INTERNAL_ASSERT(
      false,
      "at::cuda::sparse::csrgeam2: not implemented for ",
      typeid(scalar_t).name());
}

template <>
void csrgeam2<float>(CUSPARSE_CSRGEAM2_ARGTYPES(float));
template <>
void csrgeam2<double>(CUSPARSE_CSRGEAM2_ARGTYPES(double));
template <>
void csrgeam2<c10::complex<float>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<float>));
template <>
void csrgeam2<c10::complex<double>>(
    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex<double>));

} // namespace sparse
} // namespace cuda
} // namespace at
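The header above follows a common ATen pattern: a generic template that fails loudly at runtime plus explicit specializations declared for the supported scalar types and defined in the .cpp. A self-contained sketch of that pattern (plain C++, no CUDA; `axpy` is an invented stand-in):

#include <cstdio>
#include <cstdlib>
#include <typeinfo>

template <typename scalar_t>
inline void axpy(int n, scalar_t alpha, const scalar_t* x, scalar_t* y) {
  // Generic fallback: reject scalar types that have no specialization.
  std::fprintf(stderr, "axpy: not implemented for %s\n", typeid(scalar_t).name());
  std::abort();
}

template <>
void axpy<float>(int n, float alpha, const float* x, float* y) {
  // The real backend call (e.g. a vendor BLAS routine) would go here.
  for (int i = 0; i < n; ++i) y[i] += alpha * x[i];
}

int main() {
  float x[3] = {1, 2, 3}, y[3] = {0, 0, 0};
  axpy<float>(3, 2.0f, x, y);  // dispatches to the specialization
  return y[2] == 6.0f ? 0 : 1;
}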
@@ -5,12 +5,12 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/cuda/MiscUtils.h>

#if AT_USE_CUSPARSE_GENERIC_API()

namespace at {
namespace cuda {
namespace sparse {

#if AT_USE_CUSPARSE_GENERIC_API()

namespace {

// If a specific GPU model does not provide native support for a given data

@@ -93,7 +93,8 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) {

CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) {
  // cuSPARSE doesn't support batched vectors
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() == 1);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      input.dim() == 1 || (input.dim() == 2 && input.size(-1) == 1));

  // cuSPARSE doesn't support non-contiguous vectors
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_contiguous());

@@ -149,8 +150,8 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) {
  descriptor_.reset(raw_descriptor);
}

#endif // AT_USE_CUSPARSE_GENERIC_API()

} // namespace sparse
} // namespace cuda
} // namespace at

#endif // AT_USE_CUSPARSE_GENERIC_API()
@@ -6,14 +6,14 @@

#include <c10/core/ScalarType.h>

#if AT_USE_CUSPARSE_GENERIC_API()
#if defined(USE_ROCM)
#include <type_traits>
#endif

namespace at {
namespace cuda {
namespace sparse {

cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);

template <typename T, cusparseStatus_t (*destructor)(T*)>
struct CuSparseDescriptorDeleter {
  void operator()(T* x) {

@@ -37,6 +37,25 @@ class CuSparseDescriptor {
  std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
};

#if defined(USE_ROCM)
// hipSPARSE doesn't define this
using cusparseMatDescr = std::remove_pointer<cusparseMatDescr_t>::type;
#endif

class TORCH_CUDA_CPP_API CuSparseMatDescriptor
    : public CuSparseDescriptor<cusparseMatDescr, &cusparseDestroyMatDescr> {
 public:
  CuSparseMatDescriptor() {
    cusparseMatDescr_t raw_descriptor;
    TORCH_CUDASPARSE_CHECK(cusparseCreateMatDescr(&raw_descriptor));
    descriptor_.reset(raw_descriptor);
  }
};

#if AT_USE_CUSPARSE_GENERIC_API()

cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);

class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
    : public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
 public:

@@ -83,8 +102,54 @@ class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
        values.data_ptr()));
  }
#endif

#if AT_USE_CUSPARSE_GENERIC_SPSV()
  void set_mat_fill_mode(bool upper) {
    cusparseFillMode_t fill_mode =
        upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER;
    TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
        this->descriptor(), CUSPARSE_SPMAT_FILL_MODE, &fill_mode, sizeof(fill_mode)));
  }

  void set_mat_diag_type(bool unit) {
    cusparseDiagType_t diag_type =
        unit ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT;
    TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
        this->descriptor(), CUSPARSE_SPMAT_DIAG_TYPE, &diag_type, sizeof(diag_type)));
  }
#endif
};

#if AT_USE_CUSPARSE_GENERIC_SPSV()
class TORCH_CUDA_CPP_API CuSparseSpSVDescriptor
    : public CuSparseDescriptor<cusparseSpSVDescr, &cusparseSpSV_destroyDescr> {
 public:
  CuSparseSpSVDescriptor() {
    cusparseSpSVDescr_t raw_descriptor;
    TORCH_CUDASPARSE_CHECK(cusparseSpSV_createDescr(&raw_descriptor));
    descriptor_.reset(raw_descriptor);
  }
};
#endif

#if AT_USE_CUSPARSE_GENERIC_SPSM()
class TORCH_CUDA_CPP_API CuSparseSpSMDescriptor
    : public CuSparseDescriptor<cusparseSpSMDescr, &cusparseSpSM_destroyDescr> {
 public:
  CuSparseSpSMDescriptor() {
    cusparseSpSMDescr_t raw_descriptor;
    TORCH_CUDASPARSE_CHECK(cusparseSpSM_createDescr(&raw_descriptor));
    descriptor_.reset(raw_descriptor);
  }
};
#endif

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
    : public CuSparseDescriptor<cusparseSpGEMMDescr, &cusparseSpGEMM_destroyDescr> {

@@ -97,8 +162,8 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
};
#endif

#endif // AT_USE_CUSPARSE_GENERIC_API()

} // namespace sparse
} // namespace cuda
} // namespace at

#endif // AT_USE_CUSPARSE_GENERIC_API()
@@ -1,6 +1,7 @@
#include <ATen/ATen.h>
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/TensorMeta.h>
#include <ATen/NativeFunctions.h>
#include <ATen/ExpandUtils.h>

@@ -203,6 +204,45 @@ extern "C" void strsm_(char *side, char *uplo, char *trans, char *diag, int *n,
#endif

namespace at {
namespace meta {

TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular) {
  TORCH_CHECK(self.dim() >= 2,
           "torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead");
  TORCH_CHECK(A.dim() >= 2,
           "torch.triangular_solve: Expected A to have at least 2 dimensions, but it has ", A.dim(), " dimensions instead");

  at::native::linearSolveCheckInputs(self, A, "triangular_solve");

  if (A.layout() == Layout::Strided) {
    std::vector<int64_t> self_broadcast_size, A_broadcast_size;
    std::tie(self_broadcast_size, A_broadcast_size) = at::native::_linalg_broadcast_batch_dims(self, A);

    auto ndim = self_broadcast_size.size();
    auto nrows = A.size(-2);

    // make column major strides for BLAS
    auto solution_strides = at::detail::defaultStrides(self_broadcast_size);
    solution_strides[ndim - 2] = 1;
    solution_strides[ndim - 1] = nrows;
    set_output(0, self_broadcast_size, solution_strides, self.options(), {});

    // make column major strides for BLAS
    auto clone_A_strides = at::detail::defaultStrides(A_broadcast_size);
    clone_A_strides[ndim - 2] = 1;
    clone_A_strides[ndim - 1] = nrows;
    set_output(1, A_broadcast_size, clone_A_strides, A.options(), {});
  } else if (A.layout() == Layout::SparseCsr) {
    // no broadcasting for non-strided layout
    set_output(0, self.sizes(), {}, self.options(), {}); // make row major strides for Sparse BLAS
    set_output(1, {0}, {}, self.options(), {}); // return 0-sized tensor
  } else {
    TORCH_INTERNAL_ASSERT(false, "triangular_solve: Got an unexpected layout.");
  }
}

} // namespace meta

namespace native {

#if AT_BUILD_WITH_LAPACK()
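The meta function pre-allocates outputs with batched column-major ("Fortran contiguous") strides so the LAPACK/BLAS backends can work in place. The stride trick — start from the default row-major strides, then set the last two dimensions to 1 and nrows — can be checked in isolation (a sketch; `default_strides` is a stand-in for at::detail::defaultStrides):

#include <cstdint>
#include <vector>

// Row-major (default) strides for a given shape.
static std::vector<int64_t> default_strides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * sizes[i + 1];
  }
  return strides;
}

int main() {
  std::vector<int64_t> sizes = {4, 3, 5};   // batch of 4 matrices, each 3 x 5
  auto strides = default_strides(sizes);    // {15, 5, 1}
  const auto ndim = sizes.size();
  strides[ndim - 2] = 1;                    // column-major within each matrix
  strides[ndim - 1] = sizes[ndim - 2];      // = nrows (3); batch stride stays 15
  return (strides[0] == 15 && strides[1] == 1 && strides[2] == 3) ? 0 : 1;
}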
@@ -1593,31 +1633,25 @@ The result of the computation is saved in-place in 'result' tensor,
'unitriangular' if true then the diagonal elements of 'input' are assumed to be 1
and the actual diagonal values are not used.
*/
static std::tuple<Tensor&, Tensor&> triangular_solve_out_info(
    Tensor& result,
    Tensor& clone_input,
    Tensor& infos,
static void triangular_solve_out_impl(
    const Tensor& result,
    const Tensor& clone_input,
    const Tensor& input,
    const Tensor& other,
    bool upper, bool transpose, bool unitriangular) {
  // These internal asserts make explicit the assumptions in the implementation
  // Error check with the actual error messages are done on the higher level of
  // the hierarchy of calls
  TORCH_INTERNAL_ASSERT(input.dim() >= 2);
  TORCH_INTERNAL_ASSERT(input.size(-2) == input.size(-1));
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.size(-2) == input.size(-1));

  TORCH_INTERNAL_ASSERT(input.device() == other.device());
  TORCH_INTERNAL_ASSERT(input.device() == result.device());
  TORCH_INTERNAL_ASSERT(input.device() == clone_input.device());
  TORCH_INTERNAL_ASSERT(input.device() == infos.device());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == other.device());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == result.device());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == clone_input.device());

  TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
  TORCH_INTERNAL_ASSERT(input.scalar_type() == result.scalar_type());
  TORCH_INTERNAL_ASSERT(input.scalar_type() == clone_input.scalar_type());

  TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
  TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
  TORCH_INTERNAL_ASSERT(infos.is_contiguous());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == other.scalar_type());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == result.scalar_type());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == clone_input.scalar_type());

  // if 'result' has no elements we can modify it
  if (result.numel() == 0) {

@@ -1632,60 +1666,38 @@ static std::tuple<Tensor&, Tensor&> triangular_solve_out_info(
  }

  // 'result' and 'clone_input' must be in batched column major order (Fortran contiguous)
  TORCH_INTERNAL_ASSERT(result.mT().is_contiguous());
  TORCH_INTERNAL_ASSERT(clone_input.mT().is_contiguous());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.mT().is_contiguous());
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(clone_input.mT().is_contiguous());

  // triangular_solve_stub performs calculations in-place
  // 'result' must be a copy of 'other'
  // 'clone_input' must be a copy of 'input'
  TORCH_INTERNAL_ASSERT(result.sizes().equals(other.sizes()));
  TORCH_INTERNAL_ASSERT(clone_input.sizes().equals(input.sizes()));
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(other.sizes()));
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(clone_input.sizes().equals(input.sizes()));
  result.copy_(other);
  clone_input.copy_(input);

  triangular_solve_stub(input.device().type(), clone_input, result, /*left=*/true, upper, transpose ? TransposeType::Transpose : TransposeType::NoTranspose, unitriangular);

  return std::tuple<Tensor&, Tensor&>(result, clone_input);
}

// Supports arbitrary batch dimensions for self and A
std::tuple<Tensor, Tensor> triangular_solve(const Tensor& self, const Tensor& A,
                                            bool upper, bool transpose, bool unitriangular) {
  TORCH_CHECK(self.dim() >= 2,
           "torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead");
  TORCH_CHECK(A.dim() >= 2,
           "torch.triangular_solve: Expected A to have at least 2 dimensions, but it has ", A.dim(), " dimensions instead");
TORCH_IMPL_FUNC(triangular_solve_out)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, const Tensor& result, const Tensor& clone_A) {
  Tensor self_broadcast, A_broadcast;
  std::tie(self_broadcast, A_broadcast) = _linalg_broadcast_batch_dims(self, A, "triangular_solve");

  Tensor self_broadcasted, A_broadcasted;
  std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "triangular_solve");
  bool copy_needed = !result.transpose(-2, -1).is_contiguous();
  copy_needed |= !clone_A.transpose(-2, -1).is_contiguous();

  Tensor result = at::empty({0}, self.options());
  Tensor clone_A = at::empty({0}, self.options());
  Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self_broadcasted))}, self.options().dtype(kInt));
  if (copy_needed) {
    Tensor result_tmp = at::empty({0}, self.options());
    Tensor clone_A_tmp = at::empty({0}, A.options());

  triangular_solve_out_info(result, clone_A, infos, A_broadcasted, self_broadcasted, upper, transpose, unitriangular);
    triangular_solve_out_impl(result_tmp, clone_A_tmp, A_broadcast, self_broadcast, upper, transpose, unitriangular);

  if (self_broadcasted.dim() > 2) {
    batchCheckErrors(infos, "triangular_solve");
  } else {
    singleCheckErrors(infos.item().toInt(), "triangular_solve");
  }

  return std::tuple<Tensor, Tensor>(result, clone_A);
}

std::tuple<Tensor&, Tensor&> triangular_solve_out(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, Tensor& result, Tensor& clone_A) {
  checkSameDevice("triangular_solve", result, self);
  checkLinalgCompatibleDtype("triangular_solve", result, self);
  checkSameDevice("triangular_solve", clone_A, self, "clone_A");
  checkLinalgCompatibleDtype("triangular_solve", clone_A, self, "clone_A");
  Tensor result_tmp, clone_A_tmp;
  std::tie(result_tmp, clone_A_tmp) = at::native::triangular_solve(self, A, upper, transpose, unitriangular);
  at::native::resize_output(result, result_tmp.sizes());
  at::native::resize_output(clone_A, clone_A_tmp.sizes());
  result.copy_(result_tmp);
  clone_A.copy_(clone_A_tmp);
  return std::tuple<Tensor&, Tensor&>(result, clone_A);
  } else {
    triangular_solve_out_impl(result, clone_A, A_broadcast, self_broadcast, upper, transpose, unitriangular);
  }
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -211,8 +211,8 @@ using lstsq_fn = void (*)(
DECLARE_DISPATCH(lstsq_fn, lstsq_stub);

using triangular_solve_fn = void (*)(
    Tensor& /*A*/,
    Tensor& /*B*/,
    const Tensor& /*A*/,
    const Tensor& /*B*/,
    bool /*left*/,
    bool /*upper*/,
    TransposeType /*transpose*/,
@@ -795,7 +795,7 @@ This is an in-place routine, content of 'B' is overwritten.
and the actual diagonal values are not used.
*/
template<typename scalar_t>
void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
#if !AT_BUILD_WITH_BLAS()
  TORCH_CHECK(
      false,

@@ -826,7 +826,7 @@ void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, Transpo
#endif
}

void triangular_solve_kernel(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cpu", [&]{
    apply_triangular_solve<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
@@ -1337,14 +1337,6 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
  auto input_shape = input.sizes().slice(2);
  auto grad_output_shape = gO.sizes().slice(2);

  if (kernel_size.size() == 1) {
    auto expected_input_shape = (kernel_size[0] - 1) * gi_conv_params.dilation[1]
        - 2 * gi_conv_params.padding[1]
        + (gi_conv_params.stride[1] * (grad_output_shape[0] - 1) + 1);
    if (expected_input_shape != input_shape[0]) {
      gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape;
    }
  } else {
    for (const auto i : c10::irange(kernel_size.size())) {
      // Check if whole input has been used or not
      auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i]

@@ -1354,7 +1346,6 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
      gi_conv_params.output_padding[i] = input_shape[i] - expected_input_shape;
    }
  }
  }

  if (gO.is_cuda()) {
    gO = gO.contiguous();
@@ -308,9 +308,7 @@ static inline void checkAllSameDim(TensorList tensors, int64_t dim) {
  }
}

static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
  linearSolveCheckInputs(arg1, arg2, name);

static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) {
  // broadcast the batch dimensions of arg1 and arg2.
  IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2);
  IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2);

@@ -321,6 +319,14 @@ static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tenso

  std::vector<int64_t> arg2_expand_size({expand_batch_portion});
  arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) });
  return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size));
}

static inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
  linearSolveCheckInputs(arg1, arg2, name);

  std::vector<int64_t> arg1_expand_size, arg2_expand_size;
  std::tie(arg1_expand_size, arg2_expand_size) = at::native::_linalg_broadcast_batch_dims(arg1, arg2);

  Tensor arg1_broadcasted = arg1.expand(arg1_expand_size);
  Tensor arg2_broadcasted = arg2.expand(arg2_expand_size);
@@ -116,6 +116,43 @@ static inline Tensor to_impl(
      self, dtype, layout, device, pin_memory, non_blocking, optional_memory_format);
}

// If input tensor is fp32, cast it to fp16, otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_reduced_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) {
  if (self.dtype() == at::ScalarType::Float &&
      ((self.device().is_cuda() && cuda_enabled) ||
       (self.device().is_cpu() && cpu_enabled))
      ) {
    at::ScalarType target = at::ScalarType::Undefined;
    if (self.device().is_cuda()) {
      target = cuda_dtype;
    } else if (self.device().is_cpu()) {
      target = cpu_dtype;
    }

    TORCH_INTERNAL_ASSERT(target != at::ScalarType::Undefined, "_autocast_to_reduced_precision requires legit ScalarType argument for given device");

    return to_impl(
        self, target, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
  } else {
    return self;
  }
}

// If input tensor is fp16, cast it to fp32, otherwise leave it alone.
// (this is intended to be used internally by the JIT autocast implementation)
Tensor _autocast_to_full_precision(const Tensor& self, bool cuda_enabled, bool cpu_enabled) {
  if (self.dtype() == at::ScalarType::Half &&
      ((self.device().is_cuda() && cuda_enabled) ||
       (self.device().is_cpu() && cpu_enabled))
      ) {
    return to_impl(
        self, at::ScalarType::Float, c10::nullopt, c10::nullopt, c10::nullopt, false, false, c10::nullopt);
  } else {
    return self;
  }
}

Tensor to(
    const Tensor& self,
    c10::optional<ScalarType> dtype,
@@ -2561,4 +2561,27 @@ std::vector<Tensor> unflatten_dense_tensors(const Tensor& flat, TensorList tenso
  return outputs;
}

}} // at::native
at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
  auto output = self.clone();
  auto slice = output.slice(dim, start, end, step);
  TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
  slice.copy_(src);
  return output;
}
at::Tensor select_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, int64_t index) {
  auto output = self.clone();
  auto slice = output.select(dim, index);
  TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
  slice.copy_(src);
  return output;
}
at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64_t offset, int64_t dim1, int64_t dim2) {
  auto output = self.clone();
  auto slice = output.diagonal(offset, dim1, dim2);
  TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
  slice.copy_(src);
  return output;
}

} // namespace native
} // namespace at
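All three *_scatter ops above share one shape: clone the input, take a view into the clone, check sizes, copy the source into the view, and return the clone. The same clone-view-copy idea on a plain buffer (a row "slice" of a row-major 2-D array; a sketch, not ATen code):

#include <algorithm>
#include <cassert>
#include <vector>

// Write `src` into row `row` of a copy of `self` (n_rows x n_cols, row-major).
static std::vector<int> row_scatter(const std::vector<int>& self,
                                    const std::vector<int>& src,
                                    int n_cols, int row) {
  std::vector<int> out = self;                          // clone
  assert(static_cast<int>(src.size()) == n_cols);       // shape check
  std::copy(src.begin(), src.end(), out.begin() + row * n_cols);  // copy into the "view"
  return out;                                           // original `self` is untouched
}

int main() {
  std::vector<int> m = {1, 2, 3, 4, 5, 6};              // 2 x 3
  auto out = row_scatter(m, {7, 8, 9}, /*n_cols=*/3, /*row=*/1);
  return (out[3] == 7 && m[3] == 4) ? 0 : 1;
}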
@@ -2,29 +2,48 @@
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/Resize.h>

#include <ATen/Parallel.h>
#include <ATen/TensorMeta.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TriangularOpsUtils.h>
#include <c10/util/irange.h>

namespace at {
namespace meta {

TORCH_META_FUNC(tril)(const Tensor& self, int64_t k) {
  set_output(self.sizes(), self.options());
}

TORCH_META_FUNC(triu)(const Tensor& self, int64_t k) {
  set_output(self.sizes(), self.options());
}

} // namespace meta

namespace native {
namespace {

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template <typename scalar_t, bool upper>
static void apply_triu_tril_single(
    scalar_t* result, scalar_t* self, bool inplace,
    int64_t k, int64_t n, int64_t m,
    int64_t res_row_stride, int64_t res_col_stride,
    int64_t self_row_stride, int64_t self_col_stride) {

template <typename scalar_t>
void apply_triu_tril_single(
    scalar_t* result,
    scalar_t* self,
    bool inplace,
    int64_t k,
    int64_t n,
    int64_t m,
    int64_t res_row_stride,
    int64_t res_col_stride,
    int64_t self_row_stride,
    int64_t self_col_stride,
    bool upper) {
  constexpr int64_t zero = 0;

  if (upper) {
    at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (const auto i : c10::irange(start, end)) {
    parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (int64_t i : c10::irange(start, end)) {
        for (int64_t j = 0; j < std::min(m, i + k); j++) {
          result[i * res_row_stride + j * res_col_stride] = 0;
        }

@@ -36,8 +55,8 @@ static void apply_triu_tril_single(
      }
    });
  } else {
    at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (const auto i : c10::irange(start, end)) {
    parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
      for (int64_t i : c10::irange(start, end)) {
        for (int64_t j = std::max(zero, i + k + 1); j < m; j++) {
          result[i * res_row_stride + j * res_col_stride] = 0;
        }

@@ -51,108 +70,101 @@ static void apply_triu_tril_single(
  }
}

template <typename scalar_t, bool upper>
void apply_triu_tril(Tensor& result, const Tensor& self, bool inplace, int64_t k) {
template <typename scalar_t>
void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int64_t k, bool upper) {
  auto n = self.size(-2);
  auto m = self.size(-1);
  auto self_data = self.data_ptr<scalar_t>();
  auto self_stride = (self.dim() > 2 && self.stride(-3) > 0) ? self.stride(-3) : 1;
  auto batchsize = batchCountTrilTriu(result);
  auto self_row_stride = self.stride(-2);
  auto self_column_stride = self.stride(-1);
  auto self_col_stride = self.stride(-1);

  auto result_data = result.data_ptr<scalar_t>();
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t result_stride, result_row_stride, result_column_stride;
  int64_t result_stride, result_row_stride, result_col_stride;
  if (result_data != self_data) {
    result_stride = (result.dim() > 2 && result.stride(-3) > 0) ? result.stride(-3) : 1;
    result_row_stride = result.stride(-2);
    result_column_stride = result.stride(-1);
    result_col_stride = result.stride(-1);
  } else {
    result_stride = self_stride;
    result_row_stride = self_row_stride;
    result_column_stride = self_column_stride;
    result_col_stride = self_col_stride;
  }

  at::parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) {
  parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) {
    for (const auto b : c10::irange(start, end)) {
      scalar_t* self_batch = &self_data[b * self_stride];
      scalar_t* result_batch = &result_data[b * result_stride];
      apply_triu_tril_single<scalar_t, upper>(
          result_batch, self_batch, inplace, k, n, m,
          result_row_stride, result_column_stride, self_row_stride, self_column_stride);
      apply_triu_tril_single<scalar_t>(
          result_batch,
          self_batch,
          inplace,
          k,
          n,
          m,
          result_row_stride,
          result_col_stride,
          self_row_stride,
          self_col_stride,
          upper);
    }
  });
}

Tensor tril(const Tensor& self, int64_t k) {
  Tensor result = at::empty({0}, self.options());
  at::tril_out(result, self, k);
  return result;
}
struct UpperTriangle {
  static constexpr const char* op_name = "triu";
  static constexpr bool upper = true;
};

Tensor& tril_cpu_(Tensor &self, int64_t k) {
struct LowerTriangle {
  static constexpr const char *op_name = "tril";
  static constexpr bool upper = false;
};

template <typename Triangle>
void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
  if (self.numel() == 0) {
    return self;
    return;
  }
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  bool inplace;

  bool inplace_op = self.is_same(result);

  bool inplace_update = false;
  Tensor self_c;
  std::tie(inplace, self_c) = checkTrilTriuBatchContiguous(self, true);
  Tensor result = inplace ? self : at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "tril", [&]{
    apply_triu_tril<scalar_t, false>(result, self_c, inplace, k);
  std::tie(inplace_update, self_c) = checkTrilTriuBatchContiguous(self, inplace_op);

  Tensor result_c;
  if (inplace_op && !inplace_update) {
    result_c = at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  } else {
    result_c = result;
  }

  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
      ScalarType::BFloat16,
      ScalarType::Half,
      ScalarType::Bool,
      self.scalar_type(),
      Triangle::op_name,
      [&]{
        apply_triu_tril<scalar_t>(result_c, self_c, inplace_op && inplace_update, k, Triangle::upper);
      });
  if (!inplace) self.copy_(result);
  return self;

  if (inplace_op && !inplace_update) {
    result.copy_(result_c);
  }
}

Tensor& tril_cpu_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
  }
  Tensor self_c;
  std::tie(std::ignore, self_c) = checkTrilTriuBatchContiguous(self, false);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "tril", [&]{
    apply_triu_tril<scalar_t, false>(result, self_c, false, k);
  });
  return result;
} // namespace

TORCH_IMPL_FUNC(tril_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
  compute_triu_tril<LowerTriangle>(self, k, result);
}

Tensor triu(const Tensor& self, int64_t k) {
  Tensor result = at::empty({0}, self.options());
  at::triu_out(result, self, k);
  return result;
}

Tensor& triu_cpu_(Tensor &self, int64_t k) {
  if (self.numel() == 0) {
    return self;
  }
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  bool inplace;
  Tensor self_c;
  std::tie(inplace, self_c) = checkTrilTriuBatchContiguous(self, true);
  Tensor result = inplace ? self : at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "triu", [&]{
    apply_triu_tril<scalar_t, true>(result, self_c, inplace, k);
  });
  if (!inplace) self.copy_(result);
  return self;
}

Tensor& triu_cpu_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
  }
  Tensor self_c;
  std::tie(std::ignore, self_c) = checkTrilTriuBatchContiguous(self, false);
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "triu", [&]{
    apply_triu_tril<scalar_t, true>(result, self_c, false, k);
  });
  return result;
TORCH_IMPL_FUNC(triu_cpu)(const Tensor& self, int64_t k, const Tensor &result) {
  compute_triu_tril<UpperTriangle>(self, k, result);
}

Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) {

@@ -166,6 +178,5 @@ Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) {
  return grad_input.view(sizes);
}

} // namespace native
} // namespace at
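The CPU rewrite above funnels triu and tril through one helper parameterized by a small "tag" struct carrying the op name and the upper/lower flag as constexpr members. The pattern in isolation (a sketch with invented tag names):

#include <cstdio>

struct Upper { static constexpr const char* name = "triu"; static constexpr bool upper = true; };
struct Lower { static constexpr const char* name = "tril"; static constexpr bool upper = false; };

template <typename Triangle>
void run() {
  // Both members are compile-time constants, so the branch can be folded away.
  std::printf("%s zeroes the %s triangle\n",
              Triangle::name, Triangle::upper ? "lower" : "upper");
}

int main() {
  run<Upper>();
  run<Lower>();
  return 0;
}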
@@ -1965,7 +1965,7 @@ REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu);
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template <typename scalar_t>
static void apply_triangular_solve_batched_magma(Tensor& A, Tensor& b, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve_batched_magma(const Tensor& A, const Tensor& b, bool left, bool upper, TransposeType transpose, bool unitriangular) {
#if !AT_MAGMA_ENABLED()
AT_ERROR("triangular_solve: MAGMA library not found in "
    "compilation. Please rebuild with MAGMA.");

@@ -2030,13 +2030,13 @@ AT_ERROR("triangular_solve: MAGMA library not found in "
#endif
}

void triangular_solve_batched_magma(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_batched_magma(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
    apply_triangular_solve_batched_magma<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
}

void triangular_solve_kernel(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  // For batches smaller than 8 and matrix sizes larger than 64x64 cuBLAS forloop is faster than batched version
  if (batchCount(A) <= 8 && A.size(-1) >= 64) {
    triangular_solve_cublas(A, B, left, upper, transpose, unitriangular);
@@ -111,7 +111,7 @@ void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pi
}

template <typename scalar_t>
static void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  const auto trans = to_cublas(transpose);
  cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

@@ -138,14 +138,14 @@ static void apply_triangular_solve(Tensor& A, Tensor& B, bool left, bool upper,
  }
}

void triangular_solve_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
    apply_triangular_solve<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
}

template <typename scalar_t>
static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
static void apply_triangular_solve_batched(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  const auto trans = to_cublas(transpose);
  cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;

@@ -170,7 +170,7 @@ static void apply_triangular_solve_batched(Tensor& A, Tensor& B, bool left, bool
  at::cuda::blas::trsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A_ptr_array_data, lda, B_ptr_array_data, ldb, batch_size);
}

void triangular_solve_batched_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{
    apply_triangular_solve_batched<scalar_t>(A, B, left, upper, transpose, unitriangular);
  });
@@ -35,8 +35,8 @@ namespace at {
namespace native {

void geqrf_batched_cublas(const Tensor& input, const Tensor& tau);
void triangular_solve_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_batched_cublas(Tensor& A, Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular);
void gels_batched_cublas(const Tensor& a, Tensor& b, Tensor& infos);
void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType transpose);
@@ -4,9 +4,10 @@
#include <ATen/native/BinaryOps.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAMathCompat.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/util/TypeSafeSignMath.h>

#include <type_traits>

@@ -98,7 +99,7 @@ void div_floor_kernel_cuda(TensorIteratorBase& iter) {
  } else if (isIntegralType(dtype, /*includeBool*/ false)) {
    AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cuda", [&]() {
      gpu_kernel_with_scalars(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
        if (!std::is_unsigned<scalar_t>::value && (a < 0) != (b < 0)) {
        if (c10::signs_differ(a, b)) {
          // Subtracts one from the results of truncation division if the
          // divisor and dividend have different sign(bit)s and the remainder of
          // the division is nonzero
@@ -4,6 +4,7 @@
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/BinaryOps.h>
#include <ATen/native/TensorIterator.h>
#include <c10/util/TypeSafeSignMath.h>

#include <type_traits>

@@ -17,7 +18,7 @@ void remainder_kernel_cuda(TensorIteratorBase& iter) {
  AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "remainder_cuda", [&]() {
    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
      scalar_t r = a % b;
      if (!std::is_unsigned<scalar_t>::value && (r != 0) && ((r < 0) != (b < 0))) {
      if (r != 0 && c10::signs_differ(r, b)) {
        r += b;
      }
      return r;

@@ -28,7 +29,7 @@ void remainder_kernel_cuda(TensorIteratorBase& iter) {
    gpu_kernel_with_scalars(iter,
      []GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
        auto mod = ::fmod(a, b);
        if (!std::is_unsigned<scalar_t>::value && (mod != 0) && ((b < 0) != (mod < 0))) {
        if (mod != 0 && c10::signs_differ(b, mod)) {
          mod += b;
        }
        return mod;
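Several kernels here replace the hand-written `!std::is_unsigned<scalar_t>::value && (a < 0) != (b < 0)` test with a `c10::signs_differ` helper pulled in from TypeSafeSignMath.h. A hedged sketch of such a helper (invented names, not the c10 implementation; requires C++17 for `if constexpr`):

#include <type_traits>

template <typename T>
constexpr bool sketch_is_negative(const T& x) {
  // For unsigned types this is always false, avoiding the
  // "comparison of unsigned expression < 0" warning.
  if constexpr (std::is_unsigned<T>::value) {
    return false;
  } else {
    return x < T(0);
  }
}

template <typename T, typename U>
constexpr bool sketch_signs_differ(const T& a, const U& b) {
  return sketch_is_negative(a) != sketch_is_negative(b);
}

static_assert(!sketch_signs_differ(3u, 5), "an unsigned value is never negative");
static_assert(sketch_signs_differ(-3, 5), "mixed signs are detected");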
@@ -55,7 +55,7 @@ __global__ void triu_tril_kernel(
}

template <bool upper>
Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, const char* name) {
void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) {
  int64_t N = self.numel();
  dim3 dim_block = cuda::getApplyBlock();
  dim3 dim_grid((N + dim_block.x - 1) / dim_block.x);

@@ -76,31 +76,18 @@ Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, c
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }
  });
  return result;
}

Tensor& tril_cuda_(Tensor &self, int64_t k) {
  return tril_cuda_out(self, k, self);
}

Tensor& tril_cuda_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
TORCH_IMPL_FUNC(tril_cuda)(const Tensor& self, int64_t k, const Tensor &result) {
  if (self.numel() != 0) {
    triu_tril_cuda_template<false>(result, self, k, "tril");
  }
  return triu_tril_cuda_template<false>(result, self, k, "tril");
}

Tensor& triu_cuda_(Tensor &self, int64_t k) {
  return triu_cuda_out(self, k, self);
}

Tensor& triu_cuda_out(const Tensor& self, int64_t k, Tensor &result) {
  at::native::resize_output(result, self.sizes());
  if (self.numel() == 0) {
    return result;
TORCH_IMPL_FUNC(triu_cuda)(const Tensor& self, int64_t k, const Tensor &result) {
  if (self.numel() != 0) {
    triu_tril_cuda_template<true>(result, self, k, "triu");
  }
  return triu_tril_cuda_template<true>(result, self, k, "triu");
}

// Copy the kth diagonal of a matrix B to a vector A.
@@ -154,12 +154,6 @@ void nan_to_num_kernel_cuda(
}

void frexp_kernel_cuda(TensorIteratorBase& iter) {
#if defined(USE_ROCM)
  // Reference: https://rocmdocs.amd.com/en/latest/ROCm_API_References/HIP-MATH.html
  // https://github.com/ROCm-Developer-Tools/HIP/issues/2169
  // ROCm does not support frexp function yet
  TORCH_CHECK(false, "torch.frexp() is not implemented on ROCm platform.");
#else
  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half,
    // The iter.dtype() here is the dtype of mantissa output.
    // It's a floating point type and must be the same as the input's dtype.

@@ -171,7 +165,6 @@ void frexp_kernel_cuda(TensorIteratorBase& iter) {
      return {mantissa, exponent};
    });
  });
#endif
}

REGISTER_DISPATCH(bitwise_not_stub, &bitwise_not_kernel_cuda);
@ -6,6 +6,7 @@
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cuda/Math.cuh>
#include <c10/util/TypeSafeSignMath.h>

#include <type_traits>

@ -38,8 +39,7 @@ void sign_kernel_cuda(TensorIteratorBase& iter){
} else {
AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "sign_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t {
scalar_t zero = scalar_t(0);
return (zero < a) - (a < zero);
return c10::signum(a);
});
});
}

@ -47,7 +47,7 @@ void sign_kernel_cuda(TensorIteratorBase& iter){

void signbit_kernel_cuda(TensorIteratorBase& iter){
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_cuda", [&]() {
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !std::is_unsigned<scalar_t>::value && a < 0; });
gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return is_negative(a); });
});
}

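The sign and signbit kernels above now delegate to helpers from c10/util/TypeSafeSignMath.h. A minimal sketch of the semantics being replaced; the helpers are assumed to generalize these expressions safely to unsigned types, which is the point of the header name.

#include <type_traits>

template <typename T>
T signum_reference(T a) {
  // -1, 0, or +1: exactly the (zero < a) - (a < zero) expression the old lambda used.
  return (T(0) < a) - (a < T(0));
}

template <typename T>
bool is_negative_reference(T a) {
  // Unsigned types can never be negative; this mirrors the old signbit lambda.
  return !std::is_unsigned<T>::value && a < T(0);
}
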
@ -645,6 +645,7 @@
|
|||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: as_strided_
|
||||
|
||||
|
|
@ -3126,7 +3127,7 @@
|
|||
- func: mv(Tensor self, Tensor vec) -> Tensor
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU, CUDA, SparseCsrCUDA: mv
|
||||
CompositeExplicitAutograd: mv
|
||||
SparseCPU, SparseCUDA, SparseCsrCPU: mv_sparse
|
||||
|
||||
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
|
||||
|
|
@ -3882,6 +3883,7 @@
|
|||
# this. If this `Variable` is a view, throws an `std::runtime_error()`.
|
||||
- func: detach_(Tensor(a!) self) -> Tensor(a!)
|
||||
variants: function, method
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: detach_
|
||||
|
||||
|
|
@ -3910,6 +3912,27 @@
|
|||
dispatch:
|
||||
CompositeExplicitAutograd: slice_backward
|
||||
|
||||
- func: slice_scatter(Tensor self, Tensor src, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor
|
||||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: slice_scatter
|
||||
|
||||
- func: select_scatter(Tensor self, Tensor src, int dim, int index) -> Tensor
|
||||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: select_scatter
|
||||
|
||||
- func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
|
||||
variants: function, method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: diagonal_scatter
|
||||
|
||||
- func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
|
||||
variants: function, method
|
||||
dispatch:
|
||||
|
|
@ -4014,6 +4037,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: squeeze_
|
||||
|
||||
|
|
@ -4021,6 +4045,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: squeeze_
|
||||
|
||||
|
|
@ -4028,6 +4053,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
|
||||
- func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
|
||||
variants: function, method
|
||||
|
|
@ -4259,6 +4285,7 @@
|
|||
device_check: NoCheck
|
||||
device_guard: False
|
||||
variants: method
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: t_
|
||||
|
||||
|
|
@ -4365,6 +4392,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: transpose_
|
||||
|
||||
|
|
@ -4501,6 +4529,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: unsqueeze_
|
||||
|
||||
|
|
@ -5303,7 +5332,7 @@
|
|||
- func: dequantize.self(Tensor self) -> Tensor
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU: dequantize_cpu
|
||||
CPU, CUDA: dequantize_cpu_or_cuda
|
||||
QuantizedCPU, QuantizedCUDA: dequantize_quantized
|
||||
|
||||
- func: dequantize.tensors(Tensor[] tensors) -> Tensor[]
|
||||
|
|
@ -5425,6 +5454,14 @@
|
|||
- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor)
|
||||
variants: function
|
||||
|
||||
- func: _autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a)
|
||||
variants: method
|
||||
device_guard: False
|
||||
|
||||
- func: _autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)
|
||||
variants: method
|
||||
device_guard: False
|
||||
|
||||
- func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
|
|
@ -6098,16 +6135,12 @@
|
|||
CPU, CUDA: bitwise_right_shift
|
||||
|
||||
- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
|
||||
structured_delegate: tril.out
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: tril_cpu_
|
||||
CUDA: tril_cuda_
|
||||
|
||||
- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
|
||||
structured_delegate: triu.out
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU: triu_cpu_
|
||||
CUDA: triu_cuda_
|
||||
|
||||
- func: digamma_(Tensor(a!) self) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
|
|
@ -6221,24 +6254,24 @@
|
|||
CPU, CUDA: cross
|
||||
|
||||
- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU: triu_cpu_out
|
||||
CUDA: triu_cuda_out
|
||||
CPU: triu_cpu
|
||||
CUDA: triu_cuda
|
||||
|
||||
- func: triu(Tensor self, int diagonal=0) -> Tensor
|
||||
structured_delegate: triu.out
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: triu
|
||||
|
||||
- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU: tril_cpu_out
|
||||
CUDA: tril_cuda_out
|
||||
CPU: tril_cpu
|
||||
CUDA: tril_cuda
|
||||
|
||||
- func: tril(Tensor self, int diagonal=0) -> Tensor
|
||||
structured_delegate: tril.out
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: tril
|
||||
|
||||
- func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
|
||||
dispatch:
|
||||
|
|
@ -6733,13 +6766,14 @@
|
|||
CUDA: legacy_lstsq_cuda
|
||||
|
||||
- func: triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA: triangular_solve_out
|
||||
SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda
|
||||
|
||||
- func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
|
||||
structured_delegate: triangular_solve.X
|
||||
variants: method, function
|
||||
dispatch:
|
||||
CPU, CUDA: triangular_solve
|
||||
|
||||
- func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
|
||||
dispatch:
|
||||
|
|
@ -6786,6 +6820,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
|
||||
# swapdims, alias for transpose
|
||||
- func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
|
||||
|
|
@ -6797,6 +6832,7 @@
|
|||
variants: method
|
||||
device_check: NoCheck
|
||||
device_guard: False
|
||||
tags: inplace_view
|
||||
|
||||
- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
|
|
|
|||
|
|
@ -54,7 +54,8 @@ Tensor quantize_per_channel(
auto quantizer = make_per_channel_affine_quantizer(scales, zero_points, axis, dtype);
return quantizer->quantize(self);
}
Tensor dequantize_cpu(const Tensor& self) {

Tensor dequantize_cpu_or_cuda(const Tensor& self) {
TORCH_CHECK(!self.is_quantized());
return self.to(at::kFloat);
}

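A hypothetical usage note for the dequantize_cpu_or_cuda path above: for a tensor that is not quantized, dequantize now simply converts to float on either device. The tensor contents and the arange call here are illustrative only.

#include <ATen/ATen.h>

void dequantize_passthrough_demo() {
  at::Tensor x = at::arange(4, at::kInt);   // a plain, non-quantized tensor
  at::Tensor y = at::dequantize(x);         // routed to dequantize_cpu_or_cuda
  TORCH_CHECK(y.scalar_type() == at::kFloat);
}
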
@ -349,10 +349,8 @@ Tensor qnnpack_avg_pool2d(
|
|||
pytorch_qnnp_operator_t qnnpack_operator{nullptr};
|
||||
const pytorch_qnnp_status createStatus =
|
||||
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
padH /* input_padding_top */,
|
||||
padW /* input_padding_right */,
|
||||
padH /* input_padding_bottom */,
|
||||
padW /* input_padding_left */,
|
||||
padH /* input_padding_height */,
|
||||
padW /* input_padding_width */,
|
||||
kH /* kernel height */,
|
||||
kW /* kernel width */,
|
||||
dH /* stride height */,
|
||||
|
|
|
|||
|
|
@ -41,18 +41,13 @@ static void convolution_q8(benchmark::State& state, const char* net, bool per_ch
const size_t inputPixelStride = groups * groupInputChannels;
const size_t effectiveKernelHeight = (kernelHeight - 1) * dilation + 1;
const size_t effectiveKernelWidth = (kernelWidth - 1) * dilation + 1;
const size_t paddingLeft = effectiveKernelWidth / 2;
const size_t paddingTop = effectiveKernelHeight / 2;
const size_t paddingRight = effectiveKernelWidth - 1 - paddingLeft;
const size_t paddingBottom = effectiveKernelHeight - 1 - paddingTop;
const size_t paddingWidth = effectiveKernelWidth / 2;
const size_t paddingHeight = effectiveKernelHeight / 2;
const size_t outputHeight =
(paddingTop + inputHeight + paddingBottom - effectiveKernelHeight) /
subsampling +
(inputHeight + paddingHeight * 2 - effectiveKernelHeight) / subsampling +
1;
const size_t outputWidth =
(paddingLeft + inputWidth + paddingRight - effectiveKernelWidth) /
subsampling +
1;
(inputWidth + paddingWidth * 2 - effectiveKernelWidth) / subsampling + 1;

std::vector<uint8_t> input(
batchSize * inputHeight * inputWidth * inputPixelStride);

@ -78,10 +73,8 @@ static void convolution_q8(benchmark::State& state, const char* net, bool per_ch
std::vector<float> requantization_scale(
num_zero_points_padded, 0.5 * 0.5 / 0.5);
status = pytorch_qnnp_create_convolution2d_nhwc_q8(
paddingTop,
paddingRight,
paddingBottom,
paddingLeft,
paddingHeight,
paddingWidth,
kernelHeight,
kernelWidth,
subsampling,

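The benchmark above replaces four asymmetric padding values with a symmetric paddingHeight/paddingWidth pair, and its output-size arithmetic reduces to the familiar convolution formula. A small sketch under the same symmetric-padding convention:

#include <cstddef>

static inline size_t conv_output_dimension(
    size_t input, size_t padding, size_t kernel, size_t dilation, size_t subsampling) {
  const size_t effective_kernel = (kernel - 1) * dilation + 1;
  // Matches (input + padding * 2 - effectiveKernel) / subsampling + 1 above.
  return (input + padding * 2 - effective_kernel) / subsampling + 1;
}
// e.g. conv_output_dimension(13, 1, 3, 1, 1) == 13
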
@ -40,10 +40,8 @@ enum pytorch_qnnp_status pytorch_qnnp_deinitialize(void);
|
|||
typedef struct pytorch_qnnp_operator* pytorch_qnnp_operator_t;
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
uint32_t subsampling_height,
|
||||
|
|
@ -66,12 +64,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
|||
pytorch_qnnp_operator_t* convolution);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
||||
uint32_t input_padding_front,
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_back,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_depth,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_depth,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
|
|
@ -120,10 +115,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_convolution_ndhwc_q8(
|
|||
pthreadpool_t threadpool);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t adjustment_height,
|
||||
uint32_t adjustment_width,
|
||||
uint32_t kernel_height,
|
||||
|
|
@ -227,10 +220,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_global_average_pooling_nwc_q8(
|
|||
size_t output_stride);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
@ -257,10 +248,8 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_average_pooling2d_nhwc_q8(
|
|||
pthreadpool_t threadpool);
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
|
|||
|
|
@ -30,10 +30,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
@ -145,8 +143,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
goto error;
}

const bool any_padding = (input_padding_left | input_padding_top |
input_padding_right | input_padding_bottom) != 0;
const bool any_padding = (input_padding_width | input_padding_height) != 0;
const uint32_t kr = pytorch_qnnp_params.q8avgpool.kr;
const uint32_t mr = pytorch_qnnp_params.q8avgpool.mr;
const uint32_t qr = pytorch_qnnp_params.q8avgpool.qr;

@ -162,11 +159,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_average_pooling2d_nhwc_q8(
average_pooling->zero_pointer = zero_buffer;
}

average_pooling->input_padding_top = input_padding_top;
average_pooling->input_padding_right = input_padding_right;
average_pooling->input_padding_bottom = input_padding_bottom;
average_pooling->input_padding_left = input_padding_left;
average_pooling->input_padding_height = input_padding_height;
average_pooling->input_padding_width = input_padding_width;
average_pooling->kernel_height = pooling_height;
average_pooling->kernel_width = pooling_width;
average_pooling->stride_height = stride_height;

@ -239,13 +233,11 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_average_pooling2d_nhwc_q8(
average_pooling->input_pixel_stride = input_pixel_stride;

average_pooling->output_height = compute_output_dimension(
average_pooling->input_padding_top + input_height +
average_pooling->input_padding_bottom,
input_height + average_pooling->input_padding_height * 2,
average_pooling->kernel_height,
average_pooling->stride_height);
average_pooling->output_width = compute_output_dimension(
average_pooling->input_padding_left + input_width +
average_pooling->input_padding_right,
input_width + average_pooling->input_padding_width * 2,
average_pooling->kernel_width,
average_pooling->stride_width);
average_pooling->output = output;

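The setup code above now feeds compute_output_dimension the symmetrically padded extent (input + padding * 2). The helper's body lives earlier in average-pooling.c and is not shown in this hunk, so the following is a sketch of the standard pooling formula it is assumed to apply:

#include <cstddef>

static inline size_t pooling_output_dimension(
    size_t padded_input, size_t kernel, size_t stride) {
  // padded_input already includes the padding on both sides.
  return (padded_input - kernel) / stride + 1;
}
// e.g. pooling_output_dimension(4 + 1 * 2, 2, 2) == 3
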
@ -38,10 +38,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
uint32_t subsampling_height,
|
||||
|
|
@ -64,11 +62,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
|||
pytorch_qnnp_operator_t* convolution_out) {
|
||||
return pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
||||
0,
|
||||
input_padding_top,
|
||||
input_padding_right,
|
||||
0,
|
||||
input_padding_bottom,
|
||||
input_padding_left,
|
||||
input_padding_height,
|
||||
input_padding_width,
|
||||
1,
|
||||
kernel_height,
|
||||
kernel_width,
|
||||
|
|
@ -95,12 +90,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution2d_nhwc_q8(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
||||
uint32_t input_padding_front,
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_back,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_depth,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t kernel_depth,
|
||||
uint32_t kernel_height,
|
||||
uint32_t kernel_width,
|
||||
|
|
@ -190,52 +182,43 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
|||
subsampling_height);
|
||||
}
|
||||
|
||||
if (input_padding_top >= kernel_height) {
|
||||
if (input_padding_depth >= kernel_depth) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" depth padding: "
|
||||
"input depth padding is greater or equal to kernel depth",
|
||||
kernel_depth,
|
||||
kernel_height,
|
||||
kernel_width,
|
||||
input_padding_depth,
|
||||
input_padding_depth);
|
||||
}
|
||||
|
||||
if (input_padding_height >= kernel_height) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" height padding: "
|
||||
"input top padding is greater or equal to kernel height",
|
||||
kernel_width,
|
||||
"input height padding is greater or equal to kernel height",
|
||||
kernel_depth,
|
||||
kernel_height,
|
||||
input_padding_top,
|
||||
input_padding_bottom);
|
||||
kernel_width,
|
||||
input_padding_height,
|
||||
input_padding_height);
|
||||
}
|
||||
|
||||
if (input_padding_bottom >= kernel_height) {
|
||||
if (input_padding_width >= kernel_width) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" height padding: "
|
||||
"input bottom padding is greater or equal to kernel height",
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
input_padding_top,
|
||||
input_padding_bottom);
|
||||
}
|
||||
|
||||
if (input_padding_right >= kernel_width) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" width padding: "
|
||||
"input right padding is greater or equal to kernel width",
|
||||
kernel_width,
|
||||
"input width padding is greater or equal to kernel width",
|
||||
kernel_depth,
|
||||
kernel_height,
|
||||
input_padding_left,
|
||||
input_padding_right);
|
||||
}
|
||||
|
||||
if (input_padding_left >= kernel_width) {
|
||||
pytorch_qnnp_log_info(
|
||||
"inefficiency in convolution with %" PRIu32 "x%" PRIu32
|
||||
" kernel and %" PRIu32 "+%" PRIu32
|
||||
" width padding: "
|
||||
"input left padding is greater or equal to kernel width",
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
input_padding_left,
|
||||
input_padding_right);
|
||||
input_padding_width,
|
||||
input_padding_width);
|
||||
}
|
||||
|
||||
for (int i = 0; i < groups * group_output_channels; ++i) {
|
||||
|
|
@ -262,8 +245,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
|||
|
||||
enum pytorch_qnnp_ukernel_type ukernel_type = pytorch_qnnp_ukernel_type_none;
|
||||
const bool any_padding =
|
||||
(input_padding_front | input_padding_left | input_padding_top |
|
||||
input_padding_back | input_padding_right | input_padding_bottom) != 0;
|
||||
(input_padding_depth | input_padding_height | input_padding_width) != 0;
|
||||
if ((kernel_size == 9 || kernel_size == 25) && group_input_channels == 1 &&
|
||||
group_output_channels == 1 && groups > 1) {
|
||||
ukernel_type = pytorch_qnnp_ukernel_type_dwconv;
|
||||
|
|
@ -503,13 +485,9 @@ enum pytorch_qnnp_status pytorch_qnnp_create_convolution3d_ndhwc_q8(
|
|||
convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
|
||||
}
|
||||
|
||||
convolution->input_padding_front = input_padding_front;
|
||||
convolution->input_padding_top = input_padding_top;
|
||||
convolution->input_padding_right = input_padding_right;
|
||||
convolution->input_padding_back = input_padding_back;
|
||||
convolution->input_padding_bottom = input_padding_bottom;
|
||||
convolution->input_padding_left = input_padding_left;
|
||||
|
||||
convolution->input_padding_depth = input_padding_depth;
|
||||
convolution->input_padding_height = input_padding_height;
|
||||
convolution->input_padding_width = input_padding_width;
|
||||
convolution->kernel_depth = kernel_depth;
|
||||
convolution->kernel_height = kernel_height;
|
||||
convolution->kernel_width = kernel_width;
|
||||
|
|
@ -615,20 +593,17 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_convolution_ndhwc_q8(
|
|||
convolution->input_pixel_stride = input_pixel_stride;
|
||||
|
||||
convolution->output_depth = compute_output_dimension(
|
||||
convolution->input_padding_front + input_depth +
|
||||
convolution->input_padding_back,
|
||||
input_depth + convolution->input_padding_depth * 2,
|
||||
convolution->kernel_depth,
|
||||
convolution->dilation_depth,
|
||||
convolution->stride_depth);
|
||||
convolution->output_height = compute_output_dimension(
|
||||
convolution->input_padding_top + input_height +
|
||||
convolution->input_padding_bottom,
|
||||
input_height + convolution->input_padding_height * 2,
|
||||
convolution->kernel_height,
|
||||
convolution->dilation_height,
|
||||
convolution->stride_height);
|
||||
convolution->output_width = compute_output_dimension(
|
||||
convolution->input_padding_left + input_width +
|
||||
convolution->input_padding_right,
|
||||
input_width + convolution->input_padding_width * 2,
|
||||
convolution->kernel_width,
|
||||
convolution->dilation_width,
|
||||
convolution->stride_width);
|
||||
|
|
|
|||
|
|
@ -131,14 +131,14 @@ enum pytorch_qnnp_status qnnpackDeConv(
// Setup the kernel
const size_t output_width = compute_output_dimension(
input_width,
deconvolution->input_padding_left + deconvolution->input_padding_right,
deconvolution->input_padding_width * 2,
deconvolution->adjustment_width,
kernel_width,
deconvolution->dilation_width,
deconvolution->stride_width);
const size_t output_height = compute_output_dimension(
input_height,
deconvolution->input_padding_top + deconvolution->input_padding_bottom,
deconvolution->input_padding_height * 2,
deconvolution->adjustment_height,
kernel_height,
deconvolution->dilation_height,

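For the deconvolution setup above, compute_output_dimension now receives the doubled symmetric padding. Its body is not part of this hunk, so the following is a sketch of the usual transposed-convolution size formula under that assumption:

#include <cstddef>

static inline size_t deconv_output_dimension(
    size_t input, size_t padding_total, size_t adjustment,
    size_t kernel, size_t dilation, size_t stride) {
  const size_t effective_kernel = (kernel - 1) * dilation + 1;
  return stride * (input - 1) + adjustment + effective_kernel - padding_total;
}
// e.g. deconv_output_dimension(4, 2, 0, 3, 1, 2) == 7
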
@ -36,10 +36,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t adjustment_height,
|
||||
uint32_t adjustment_width,
|
||||
uint32_t kernel_height,
|
||||
|
|
@ -181,10 +179,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_deconvolution2d_nhwc_q8(
|
|||
deconvolution->zero_buffer = zero_buffer;
|
||||
deconvolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
|
||||
|
||||
deconvolution->input_padding_top = input_padding_top;
|
||||
deconvolution->input_padding_right = input_padding_right;
|
||||
deconvolution->input_padding_bottom = input_padding_bottom;
|
||||
deconvolution->input_padding_left = input_padding_left;
|
||||
deconvolution->input_padding_height = input_padding_height;
|
||||
deconvolution->input_padding_width = input_padding_width;
|
||||
deconvolution->adjustment_height = adjustment_height;
|
||||
deconvolution->adjustment_width = adjustment_width;
|
||||
|
||||
|
|
@ -266,8 +262,7 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
|
|||
const size_t output_height = deconvolution->output_height =
|
||||
compute_output_dimension(
|
||||
input_height,
|
||||
deconvolution->input_padding_top +
|
||||
deconvolution->input_padding_bottom,
|
||||
deconvolution->input_padding_height * 2,
|
||||
deconvolution->adjustment_height,
|
||||
kernel_height,
|
||||
deconvolution->dilation_height,
|
||||
|
|
@ -275,8 +270,7 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_deconvolution2d_nhwc_q8(
|
|||
const size_t output_width = deconvolution->output_width =
|
||||
compute_output_dimension(
|
||||
input_width,
|
||||
deconvolution->input_padding_left +
|
||||
deconvolution->input_padding_right,
|
||||
deconvolution->input_padding_width * 2,
|
||||
deconvolution->adjustment_width,
|
||||
kernel_width,
|
||||
deconvolution->dilation_width,
|
||||
|
|
|
|||
|
|
@ -40,9 +40,9 @@ void pytorch_qnnp_indirection_init_conv3d(
|
|||
const size_t dilation_depth = op->dilation_depth;
|
||||
const size_t dilation_height = op->dilation_height;
|
||||
const size_t dilation_width = op->dilation_width;
|
||||
const size_t input_padding_front = op->input_padding_front;
|
||||
const size_t input_padding_top = op->input_padding_top;
|
||||
const size_t input_padding_left = op->input_padding_left;
|
||||
const size_t input_padding_depth = op->input_padding_depth;
|
||||
const size_t input_padding_height = op->input_padding_height;
|
||||
const size_t input_padding_width = op->input_padding_width;
|
||||
|
||||
const size_t output_size = output_depth * output_height * output_width;
|
||||
const size_t kernel_size = kernel_depth * kernel_height * kernel_width;
|
||||
|
|
@ -70,16 +70,16 @@ void pytorch_qnnp_indirection_init_conv3d(
|
|||
|
||||
for (size_t kernel_z = 0; kernel_z < kernel_depth; kernel_z++) {
|
||||
const size_t input_z = output_z * stride_depth +
|
||||
kernel_z * dilation_depth - input_padding_front;
|
||||
kernel_z * dilation_depth - input_padding_depth;
|
||||
if (input_z < input_depth) {
|
||||
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
|
||||
const size_t input_y = output_y * stride_height +
|
||||
kernel_y * dilation_height - input_padding_top;
|
||||
kernel_y * dilation_height - input_padding_height;
|
||||
if (input_y < input_height) {
|
||||
for (size_t kernel_x = 0; kernel_x < kernel_width;
|
||||
kernel_x++) {
|
||||
const size_t input_x = output_x * stride_width +
|
||||
kernel_x * dilation_width - input_padding_left;
|
||||
kernel_x * dilation_width - input_padding_width;
|
||||
const size_t index = (group * batch_size + image) *
|
||||
tiled_output_size * kernel_size +
|
||||
output_tile_start * kernel_size +
|
||||
|
|
@ -154,19 +154,19 @@ void pytorch_qnnp_indirection_init_dwconv2d(
const size_t stride_width = op->stride_width;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;

for (size_t image = batch_start; image < batch_size; image++) {
for (size_t output_y = 0; output_y < output_height; output_y++) {
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
const size_t input_y = output_y * stride_height +
kernel_y * dilation_height - input_padding_top;
kernel_y * dilation_height - input_padding_height;
if (input_y < input_height) {
for (size_t output_x = 0; output_x < output_width; output_x++) {
for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const size_t input_x = output_x * stride_width +
kernel_x * dilation_width - input_padding_left;
kernel_x * dilation_width - input_padding_width;
const size_t index =
(image * output_height + output_y) * step_height +
output_x * step_width * kernel_height +

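The indirection code above subtracts the padding from an unsigned coordinate and then checks only input_y < input_height. That single comparison suffices because the subtraction wraps around for would-be negative values; a minimal sketch of the idiom:

#include <cstddef>

static inline bool padded_coordinate_in_bounds(
    size_t output, size_t stride, size_t kernel, size_t dilation,
    size_t padding, size_t input_extent) {
  const size_t input = output * stride + kernel * dilation - padding;
  // If the subtraction went "negative", input wrapped to a huge value,
  // so this one test also rejects coordinates that land in the padding.
  return input < input_extent;
}
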
@ -217,8 +217,8 @@ void pytorch_qnnp_indirection_init_deconv2d(
|
|||
const size_t stride_width = op->stride_width;
|
||||
const size_t dilation_height = op->dilation_height;
|
||||
const size_t dilation_width = op->dilation_width;
|
||||
const size_t input_padding_top = op->input_padding_top;
|
||||
const size_t input_padding_left = op->input_padding_left;
|
||||
const size_t input_padding_height = op->input_padding_height;
|
||||
const size_t input_padding_width = op->input_padding_width;
|
||||
|
||||
const size_t output_size = output_height * output_width;
|
||||
const size_t kernel_size = kernel_height * kernel_width;
|
||||
|
|
@ -237,11 +237,11 @@ void pytorch_qnnp_indirection_init_deconv2d(
|
|||
const size_t output_x = output_index % output_width;
|
||||
for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
|
||||
const size_t y =
|
||||
output_y + input_padding_top - kernel_y * dilation_height;
|
||||
output_y + input_padding_height - kernel_y * dilation_height;
|
||||
const size_t input_y = y / stride_height;
|
||||
for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
|
||||
const size_t x =
|
||||
output_x + input_padding_left - kernel_x * dilation_width;
|
||||
output_x + input_padding_width - kernel_x * dilation_width;
|
||||
const size_t input_x = x / stride_width;
|
||||
const size_t index = (group * batch_size + image) *
|
||||
tiled_output_size * kernel_size +
|
||||
|
|
@ -284,21 +284,21 @@ void pytorch_qnnp_indirection_init_maxpool2d(
const size_t stride_width = op->stride_width;
const size_t dilation_height = op->dilation_height;
const size_t dilation_width = op->dilation_width;
const size_t input_padding_top = op->input_padding_top;
const size_t input_padding_left = op->input_padding_left;
const size_t input_padding_height = op->input_padding_height;
const size_t input_padding_width = op->input_padding_width;

for (size_t image = batch_start; image < batch_size; image++) {
for (size_t output_y = 0; output_y < output_height; output_y++) {
for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
const size_t input_y =
doz(output_y * stride_height + pooling_y * dilation_height,
input_padding_top);
input_padding_height);
const size_t clamped_input_y = min(input_y, input_height - 1);
for (size_t output_x = 0; output_x < output_width; output_x++) {
for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
const size_t input_x =
doz(output_x * stride_width + pooling_x * dilation_width,
input_padding_left);
input_padding_width);
const size_t clamped_input_x = min(input_x, input_width - 1);
const size_t index =
(image * output_height + output_y) * step_height +

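The max-pooling indirection above clamps padded coordinates with doz ("difference or zero") before the min() clamp to the last valid row or column. A sketch of the assumed helper, which saturates the subtraction at zero:

#include <cstddef>

static inline size_t doz_reference(size_t a, size_t b) {
  return a >= b ? a - b : 0;
}
// doz_reference(3, 5) == 0, doz_reference(5, 3) == 2
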
@ -35,10 +35,8 @@ static inline size_t compute_output_dimension(
|
|||
}
|
||||
|
||||
enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
||||
uint32_t input_padding_top,
|
||||
uint32_t input_padding_right,
|
||||
uint32_t input_padding_bottom,
|
||||
uint32_t input_padding_left,
|
||||
uint32_t input_padding_height,
|
||||
uint32_t input_padding_width,
|
||||
uint32_t pooling_height,
|
||||
uint32_t pooling_width,
|
||||
uint32_t stride_height,
|
||||
|
|
@ -117,10 +115,8 @@ enum pytorch_qnnp_status pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
|||
goto error;
|
||||
}
|
||||
|
||||
max_pooling->input_padding_top = input_padding_top;
|
||||
max_pooling->input_padding_right = input_padding_right;
|
||||
max_pooling->input_padding_bottom = input_padding_bottom;
|
||||
max_pooling->input_padding_left = input_padding_left;
|
||||
max_pooling->input_padding_height = input_padding_height;
|
||||
max_pooling->input_padding_width = input_padding_width;
|
||||
|
||||
max_pooling->kernel_height = pooling_height;
|
||||
max_pooling->kernel_width = pooling_width;
|
||||
|
|
@ -180,14 +176,12 @@ enum pytorch_qnnp_status pytorch_qnnp_setup_max_pooling2d_nhwc_u8(
|
|||
max_pooling->input_pixel_stride = input_pixel_stride;
|
||||
|
||||
max_pooling->output_height = compute_output_dimension(
|
||||
max_pooling->input_padding_top + input_height +
|
||||
max_pooling->input_padding_bottom,
|
||||
input_height + max_pooling->input_padding_height * 2,
|
||||
max_pooling->kernel_height,
|
||||
max_pooling->dilation_height,
|
||||
max_pooling->stride_height);
|
||||
max_pooling->output_width = compute_output_dimension(
|
||||
max_pooling->input_padding_left + input_width +
|
||||
max_pooling->input_padding_right,
|
||||
input_width + max_pooling->input_padding_width * 2,
|
||||
max_pooling->kernel_width,
|
||||
max_pooling->dilation_width,
|
||||
max_pooling->stride_width);
|
||||
|
|
|
|||
|
|
@ -47,12 +47,9 @@ typedef struct {

struct pytorch_qnnp_operator {
size_t batch_size;
uint32_t input_padding_front;
uint32_t input_padding_back;
uint32_t input_padding_top;
uint32_t input_padding_right;
uint32_t input_padding_bottom;
uint32_t input_padding_left;
uint32_t input_padding_depth;
uint32_t input_padding_height;
uint32_t input_padding_width;
uint32_t adjustment_height;
uint32_t adjustment_width;
uint32_t kernel_depth;

@ -22,69 +22,35 @@
|
|||
class AveragePoolingOperatorTester {
|
||||
public:
|
||||
inline AveragePoolingOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline AveragePoolingOperatorTester& inputSize(
|
||||
|
|
@ -211,8 +177,7 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputHeight() const {
|
||||
const size_t paddedInputHeight =
|
||||
paddingTop() + inputHeight() + paddingBottom();
|
||||
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
|
||||
if (paddedInputHeight <= poolingHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -221,8 +186,7 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputWidth() const {
|
||||
const size_t paddedInputWidth =
|
||||
paddingLeft() + inputWidth() + paddingRight();
|
||||
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
|
||||
if (paddedInputWidth <= poolingWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -303,7 +267,7 @@ class AveragePoolingOperatorTester {
|
|||
|
||||
inline size_t nextOutputHeight() const {
|
||||
const size_t paddedNextInputHeight =
|
||||
paddingTop() + nextInputHeight() + paddingBottom();
|
||||
nextInputHeight() + paddingHeight() * 2;
|
||||
if (paddedNextInputHeight <= poolingHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -312,8 +276,7 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t nextOutputWidth() const {
|
||||
const size_t paddedNextInputWidth =
|
||||
paddingLeft() + nextInputWidth() + paddingRight();
|
||||
const size_t paddedNextInputWidth = nextInputWidth() + paddingWidth() * 2;
|
||||
if (paddedNextInputWidth <= poolingWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -430,9 +393,9 @@ class AveragePoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
double acc = 0.0f;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy = oy * strideHeight() + py - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py - paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix = ox * strideWidth() + px - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px - paddingWidth();
|
||||
if (ix < inputWidth() && iy < inputHeight()) {
|
||||
acc += double(
|
||||
int32_t(input
|
||||
|
|
@ -482,10 +445,8 @@ class AveragePoolingOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
poolingHeight(),
|
||||
poolingWidth(),
|
||||
strideHeight(),
|
||||
|
|
@ -596,9 +557,9 @@ class AveragePoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
double acc = 0.0f;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy = oy * strideHeight() + py - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py - paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix = ox * strideWidth() + px - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px - paddingWidth();
|
||||
if (ix < inputWidth() && iy < inputHeight()) {
|
||||
acc += double(
|
||||
int32_t(input
|
||||
|
|
@ -648,10 +609,8 @@ class AveragePoolingOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_average_pooling2d_nhwc_q8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
poolingHeight(),
|
||||
poolingWidth(),
|
||||
strideHeight(),
|
||||
|
|
@ -731,9 +690,9 @@ class AveragePoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
double acc = 0.0f;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy = oy * strideHeight() + py - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py - paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix = ox * strideWidth() + px - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px - paddingWidth();
|
||||
if (ix < nextInputWidth() && iy < nextInputHeight()) {
|
||||
acc += double(
|
||||
int32_t(input
|
||||
|
|
@ -847,10 +806,8 @@ class AveragePoolingOperatorTester {
|
|||
}
|
||||
|
||||
private:
|
||||
uint32_t paddingTop_{0};
|
||||
uint32_t paddingRight_{0};
|
||||
uint32_t paddingBottom_{0};
|
||||
uint32_t paddingLeft_{0};
|
||||
uint32_t paddingHeight_{0};
|
||||
uint32_t paddingWidth_{0};
|
||||
size_t inputHeight_{1};
|
||||
size_t inputWidth_{1};
|
||||
size_t channels_{1};
|
||||
|
|
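The tester's outputHeight()/outputWidth() above now derive the padded extent as input + padding * 2 and clamp the result to at least one output element; the else-branch is cut off in the hunks shown here, so the full expression below is a sketch under the usual pooling convention:

#include <cstddef>

static inline size_t expected_pooling_output(
    size_t input, size_t padding, size_t pooling, size_t stride) {
  const size_t padded_input = input + padding * 2;
  if (padded_input <= pooling) {
    return 1;  // mirrors the early return in outputHeight()/outputWidth()
  }
  return (padded_input - pooling) / stride + 1;
}
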
|
|||
|
|
@ -50,14 +50,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -65,7 +63,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_stride) {
|
||||
|
|
@ -114,14 +111,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 2; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -129,7 +124,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_stride) {
|
||||
|
|
@ -433,14 +427,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -448,7 +440,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_stride) {
|
||||
|
|
@ -500,14 +491,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
|
|||
for (size_t poolSize = pytorch_qnnp_params.q8avgpool.mr + 1; poolSize <=
|
||||
pytorch_qnnp_params.q8avgpool.mr + pytorch_qnnp_params.q8avgpool.qr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -515,7 +504,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_stride) {
|
||||
|
|
@ -526,14 +514,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_stride) {
|
|||
for (size_t poolSize = pytorch_qnnp_params.q8avgpool.mr + 1; poolSize <=
|
||||
pytorch_qnnp_params.q8avgpool.mr + pytorch_qnnp_params.q8avgpool.qr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -541,7 +527,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_stride) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(
|
||||
|
|
@ -830,14 +815,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.q8avgpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -845,7 +828,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_1xM_pool_with_stride) {
|
||||
|
|
@ -891,14 +873,12 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 2; poolSize <= 2 * pytorch_qnnp_params.q8avgpool.kr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
AveragePoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -906,7 +886,6 @@ TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(AVERAGE_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_stride) {
|
||||
|
|
|
|||
|
|
@ -27,69 +27,35 @@ using namespace qnnpack::testing;
|
|||
class ConvolutionOperatorTester {
|
||||
public:
|
||||
inline ConvolutionOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline ConvolutionOperatorTester& inputSize(
|
||||
|
|
@ -325,8 +291,7 @@ class ConvolutionOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputHeight() const {
|
||||
const size_t paddedInputHeight =
|
||||
paddingTop() + inputHeight() + paddingBottom();
|
||||
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
|
||||
if (paddedInputHeight <= dilatedKernelHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -336,8 +301,7 @@ class ConvolutionOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t outputWidth() const {
|
||||
const size_t paddedInputWidth =
|
||||
paddingLeft() + inputWidth() + paddingRight();
|
||||
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
|
||||
if (paddedInputWidth <= dilatedKernelWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -437,11 +401,11 @@ class ConvolutionOperatorTester {
|
|||
for (size_t ox = 0; ox < outputWidth(); ox++) {
|
||||
for (size_t ky = 0; ky < kernelHeight(); ky++) {
|
||||
const size_t iy = oy * subsamplingHeight() +
|
||||
ky * dilationHeight() - paddingTop();
|
||||
ky * dilationHeight() - paddingHeight();
|
||||
if (iy < inputHeight()) {
|
||||
for (size_t kx = 0; kx < kernelWidth(); kx++) {
|
||||
const size_t ix = ox * subsamplingWidth() +
|
||||
kx * dilationWidth() - paddingLeft();
|
||||
kx * dilationWidth() - paddingWidth();
|
||||
if (ix < inputWidth()) {
|
||||
for (size_t g = 0; g < groups(); g++) {
|
||||
for (size_t oc = 0; oc < groupOutputChannels(); oc++) {
|
||||
|
|
@ -515,10 +479,8 @@ class ConvolutionOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_convolution2d_nhwc_q8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
kernelHeight(),
|
||||
kernelWidth(),
|
||||
subsamplingHeight(),
|
||||
|
|
@ -641,10 +603,8 @@ class ConvolutionOperatorTester {
|
|||
}
|
||||
|
||||
private:
|
||||
uint32_t paddingTop_{0};
|
||||
uint32_t paddingRight_{0};
|
||||
uint32_t paddingBottom_{0};
|
||||
uint32_t paddingLeft_{0};
|
||||
uint32_t paddingHeight_{0};
|
||||
uint32_t paddingWidth_{0};
|
||||
size_t inputHeight_{1};
|
||||
size_t inputWidth_{1};
|
||||
uint32_t groups_{1};
|
||||
|
|
|
|||
|
|
@ -272,45 +272,27 @@ _STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_without_padding,
|
|||
.iterations(3)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_left_padding,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_width_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingLeft(1)
|
||||
.paddingWidth(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
.iterations(3))
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_right_padding,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_height_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingRight(1)
|
||||
.paddingHeight(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_top_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingTop(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_bottom_padding,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingBottom(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
)
|
||||
.iterations(3))
|
||||
|
||||
_STATIC_TEST(CONVOLUTION_OP, 3x3_with_input_stride,
|
||||
ConvolutionOperatorTester()
|
||||
|
|
@ -833,49 +815,29 @@ _STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_without_padding_per_channel,
|
|||
.per_channel(true)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_left_padding_per_channel,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_width_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingLeft(1)
|
||||
.paddingWidth(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
.per_channel(true))
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_right_padding_per_channel,
|
||||
_STATIC_AND_RUNTIME_TEST(
|
||||
CONVOLUTION_OP,
|
||||
3x3_with_height_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingRight(1)
|
||||
.paddingHeight(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_top_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingTop(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
|
||||
_STATIC_AND_RUNTIME_TEST(CONVOLUTION_OP, 3x3_with_bottom_padding_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
.inputSize(13, 12)
|
||||
.paddingBottom(1)
|
||||
.kernelSize(3, 3)
|
||||
.groupInputChannels(15)
|
||||
.groupOutputChannels(17)
|
||||
.iterations(3)
|
||||
.per_channel(true)
|
||||
)
|
||||
.per_channel(true))
|
||||
|
||||
_STATIC_TEST(CONVOLUTION_OP, 3x3_with_input_stride_per_channel,
|
||||
ConvolutionOperatorTester()
|
||||
|
|
|
|||
|
|
@ -27,77 +27,35 @@ using namespace qnnpack::testing;
|
|||
class DeconvolutionOperatorTester {
|
||||
public:
|
||||
inline DeconvolutionOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingTop_ + this->paddingBottom_;
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingLeft_ + this->paddingRight_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline DeconvolutionOperatorTester& adjustmentHeight(
|
||||
|
|
@@ -353,12 +311,12 @@ class DeconvolutionOperatorTester {

inline size_t outputHeight() const {
return strideHeight() * (inputHeight() - 1) + adjustmentHeight() +
dilatedKernelHeight() - paddingHeight();
dilatedKernelHeight() - paddingHeight() * 2;
}

inline size_t outputWidth() const {
return strideWidth() * (inputWidth() - 1) + adjustmentWidth() +
dilatedKernelWidth() - paddingWidth();
dilatedKernelWidth() - paddingWidth() * 2;
}

inline DeconvolutionOperatorTester& qmin(uint8_t qmin) {
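For reference, the updated outputHeight()/outputWidth() arithmetic above now subtracts the symmetric padding twice. A minimal sketch of the same formula (illustrative Python, not the tester's API):

  def deconv_output_dim(input_dim, stride, kernel, dilation, padding, adjustment):
      dilated_kernel = (kernel - 1) * dilation + 1
      # mirrors: stride * (input - 1) + adjustment + dilated_kernel - padding * 2
      return stride * (input_dim - 1) + adjustment + dilated_kernel - 2 * padding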
@@ -451,11 +409,11 @@ class DeconvolutionOperatorTester {
for (size_t oy = 0; oy < outputHeight(); oy++) {
for (size_t ox = 0; ox < outputWidth(); ox++) {
for (size_t ky = 0; ky < kernelHeight(); ky++) {
const size_t y = oy + paddingTop() - ky * dilationHeight();
const size_t y = oy + paddingHeight() - ky * dilationHeight();
const size_t iy = y / strideHeight();
if (iy * strideHeight() == y && iy < inputHeight()) {
for (size_t kx = 0; kx < kernelWidth(); kx++) {
const size_t x = ox + paddingLeft() - kx * dilationWidth();
const size_t x = ox + paddingWidth() - kx * dilationWidth();
const size_t ix = x / strideWidth();
if (ix * strideWidth() == x && ix < inputWidth()) {
for (size_t g = 0; g < groups(); g++) {
@@ -531,10 +489,8 @@ class DeconvolutionOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_deconvolution2d_nhwc_q8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
adjustmentHeight(),
adjustmentWidth(),
kernelHeight(),
@@ -674,10 +630,8 @@ class DeconvolutionOperatorTester {
}

 private:
uint32_t paddingTop_{0};
uint32_t paddingRight_{0};
uint32_t paddingBottom_{0};
uint32_t paddingLeft_{0};
uint32_t paddingHeight_{0};
uint32_t paddingWidth_{0};
size_t inputHeight_{1};
size_t inputWidth_{1};
uint32_t groups_{1};
@ -21,69 +21,35 @@
|
|||
class MaxPoolingOperatorTester {
|
||||
public:
|
||||
inline MaxPoolingOperatorTester& padding(uint32_t padding) {
|
||||
this->paddingTop_ = padding;
|
||||
this->paddingRight_ = padding;
|
||||
this->paddingBottom_ = padding;
|
||||
this->paddingLeft_ = padding;
|
||||
this->paddingHeight_ = padding;
|
||||
this->paddingWidth_ = padding;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& padding(
|
||||
uint32_t paddingHeight,
|
||||
uint32_t paddingWidth) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingHeight(uint32_t paddingHeight) {
|
||||
this->paddingTop_ = paddingHeight;
|
||||
this->paddingBottom_ = paddingHeight;
|
||||
this->paddingHeight_ = paddingHeight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingWidth(uint32_t paddingWidth) {
|
||||
this->paddingRight_ = paddingWidth;
|
||||
this->paddingLeft_ = paddingWidth;
|
||||
this->paddingWidth_ = paddingWidth;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingTop(uint32_t paddingTop) {
|
||||
this->paddingTop_ = paddingTop;
|
||||
return *this;
|
||||
inline uint32_t paddingHeight() const {
|
||||
return this->paddingHeight_;
|
||||
}
|
||||
|
||||
inline uint32_t paddingTop() const {
|
||||
return this->paddingTop_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingRight(uint32_t paddingRight) {
|
||||
this->paddingRight_ = paddingRight;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingRight() const {
|
||||
return this->paddingRight_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingBottom(uint32_t paddingBottom) {
|
||||
this->paddingBottom_ = paddingBottom;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingBottom() const {
|
||||
return this->paddingBottom_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& paddingLeft(uint32_t paddingLeft) {
|
||||
this->paddingLeft_ = paddingLeft;
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline uint32_t paddingLeft() const {
|
||||
return this->paddingLeft_;
|
||||
inline uint32_t paddingWidth() const {
|
||||
return this->paddingWidth_;
|
||||
}
|
||||
|
||||
inline MaxPoolingOperatorTester& inputSize(
|
||||
|
|
@@ -255,8 +221,7 @@ class MaxPoolingOperatorTester {
}

inline size_t outputHeight() const {
const size_t paddedInputHeight =
paddingTop() + inputHeight() + paddingBottom();
const size_t paddedInputHeight = inputHeight() + paddingHeight() * 2;
if (paddedInputHeight <= dilatedPoolingHeight()) {
return 1;
} else {
@@ -265,8 +230,7 @@ class MaxPoolingOperatorTester {
}

inline size_t outputWidth() const {
const size_t paddedInputWidth =
paddingLeft() + inputWidth() + paddingRight();
const size_t paddedInputWidth = inputWidth() + paddingWidth() * 2;
if (paddedInputWidth <= dilatedPoolingWidth()) {
return 1;
} else {
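The pooling tester follows the same convention: the padded extent is now inputHeight() + paddingHeight() * 2 instead of paddingTop() + inputHeight() + paddingBottom(). A small sketch of the resulting output-size computation; the final return statement is the usual pooling arithmetic and is assumed here, since the hunk cuts off before the else branch:

  def pool_output_dim(input_dim, padding, pool, dilation, stride):
      padded = input_dim + 2 * padding
      dilated_pool = (pool - 1) * dilation + 1
      if padded <= dilated_pool:
          return 1
      # assumed continuation of the truncated else branch
      return (padded - dilated_pool) // stride + 1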
@ -344,7 +308,7 @@ class MaxPoolingOperatorTester {
|
|||
|
||||
inline size_t nextOutputHeight() const {
|
||||
const size_t paddedNextInputHeight =
|
||||
paddingTop() + nextInputHeight() + paddingBottom();
|
||||
nextInputHeight() + paddingHeight() * 2;
|
||||
if (paddedNextInputHeight <= dilatedPoolingHeight()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@ -354,8 +318,7 @@ class MaxPoolingOperatorTester {
|
|||
}
|
||||
|
||||
inline size_t nextOutputWidth() const {
|
||||
const size_t paddedNextInputWidth =
|
||||
paddingLeft() + nextInputWidth() + paddingRight();
|
||||
const size_t paddedNextInputWidth = nextInputWidth() + paddingWidth() * 2;
|
||||
if (paddedNextInputWidth <= dilatedPoolingWidth()) {
|
||||
return 1;
|
||||
} else {
|
||||
|
|
@@ -429,11 +392,11 @@ class MaxPoolingOperatorTester {
for (size_t c = 0; c < channels(); c++) {
uint8_t maxValue = 0;
for (size_t py = 0; py < poolingHeight(); py++) {
const size_t iy =
oy * strideHeight() + py * dilationHeight() - paddingTop();
const size_t iy = oy * strideHeight() + py * dilationHeight() -
paddingHeight();
for (size_t px = 0; px < poolingWidth(); px++) {
const size_t ix =
ox * strideWidth() + px * dilationWidth() - paddingLeft();
const size_t ix = ox * strideWidth() + px * dilationWidth() -
paddingWidth();
if (ix < inputWidth() && iy < inputHeight()) {
maxValue = std::max(
maxValue,
@@ -462,10 +425,8 @@ class MaxPoolingOperatorTester {
ASSERT_EQ(
pytorch_qnnp_status_success,
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
paddingTop(),
paddingRight(),
paddingBottom(),
paddingLeft(),
paddingHeight(),
paddingWidth(),
poolingHeight(),
poolingWidth(),
strideHeight(),
@ -569,11 +530,11 @@ class MaxPoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
uint8_t maxValue = 0;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy =
|
||||
oy * strideHeight() + py * dilationHeight() - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py * dilationHeight() -
|
||||
paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix =
|
||||
ox * strideWidth() + px * dilationWidth() - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px * dilationWidth() -
|
||||
paddingWidth();
|
||||
if (ix < inputWidth() && iy < inputHeight()) {
|
||||
maxValue = std::max(
|
||||
maxValue,
|
||||
|
|
@ -602,10 +563,8 @@ class MaxPoolingOperatorTester {
|
|||
ASSERT_EQ(
|
||||
pytorch_qnnp_status_success,
|
||||
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
|
||||
paddingTop(),
|
||||
paddingRight(),
|
||||
paddingBottom(),
|
||||
paddingLeft(),
|
||||
paddingHeight(),
|
||||
paddingWidth(),
|
||||
poolingHeight(),
|
||||
poolingWidth(),
|
||||
strideHeight(),
|
||||
|
|
@ -680,11 +639,11 @@ class MaxPoolingOperatorTester {
|
|||
for (size_t c = 0; c < channels(); c++) {
|
||||
uint8_t maxValue = 0;
|
||||
for (size_t py = 0; py < poolingHeight(); py++) {
|
||||
const size_t iy =
|
||||
oy * strideHeight() + py * dilationHeight() - paddingTop();
|
||||
const size_t iy = oy * strideHeight() + py * dilationHeight() -
|
||||
paddingHeight();
|
||||
for (size_t px = 0; px < poolingWidth(); px++) {
|
||||
const size_t ix =
|
||||
ox * strideWidth() + px * dilationWidth() - paddingLeft();
|
||||
const size_t ix = ox * strideWidth() + px * dilationWidth() -
|
||||
paddingWidth();
|
||||
if (ix < nextInputWidth() && iy < nextInputHeight()) {
|
||||
maxValue = std::max(
|
||||
maxValue,
|
||||
|
|
@ -775,10 +734,8 @@ class MaxPoolingOperatorTester {
|
|||
}
|
||||
|
||||
private:
|
||||
uint32_t paddingTop_{0};
|
||||
uint32_t paddingRight_{0};
|
||||
uint32_t paddingBottom_{0};
|
||||
uint32_t paddingLeft_{0};
|
||||
uint32_t paddingHeight_{0};
|
||||
uint32_t paddingWidth_{0};
|
||||
size_t inputHeight_{1};
|
||||
size_t inputWidth_{1};
|
||||
size_t channels_{1};
|
||||
|
|
|
|||
|
|
@ -50,14 +50,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_small_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -134,14 +133,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_small_Mx1_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 2; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -335,14 +333,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_large_1xM_pool_with_padding) {
|
|||
channels += 3) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -423,14 +420,13 @@ TEST(MAX_POOLING_OP, unit_batch_many_channels_large_Mx1_pool_with_padding) {
|
|||
for (size_t poolSize = pytorch_qnnp_params.u8maxpool.mr; poolSize <=
|
||||
pytorch_qnnp_params.u8maxpool.mr + pytorch_qnnp_params.u8maxpool.qr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
@ -627,14 +623,13 @@ TEST(MAX_POOLING_OP, unit_batch_few_channels_1xM_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 3; poolSize <= pytorch_qnnp_params.u8maxpool.mr;
|
||||
poolSize++) {
|
||||
for (size_t paddingLeft = 0; paddingLeft <= 1; paddingLeft++) {
|
||||
for (size_t paddingWidth = 0; paddingWidth <= 1; paddingWidth++) {
|
||||
for (size_t paddingRight = 0; paddingRight <= 1; paddingRight++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(2)
|
||||
.inputWidth(poolSize + 2)
|
||||
.paddingLeft(paddingLeft)
|
||||
.paddingRight(paddingRight)
|
||||
.paddingWidth(paddingWidth)
|
||||
.poolingHeight(1)
|
||||
.poolingWidth(poolSize)
|
||||
.channels(channels)
|
||||
|
|
@ -707,14 +702,13 @@ TEST(MAX_POOLING_OP, unit_batch_few_channels_Mx1_pool_with_padding) {
|
|||
channels++) {
|
||||
for (size_t poolSize = 2; poolSize <= 2 * pytorch_qnnp_params.u8maxpool.kr;
|
||||
poolSize++) {
|
||||
for (size_t paddingTop = 0; paddingTop <= 1; paddingTop++) {
|
||||
for (size_t paddingHeight = 0; paddingHeight <= 1; paddingHeight++) {
|
||||
for (size_t paddingBottom = 0; paddingBottom <= 1; paddingBottom++) {
|
||||
MaxPoolingOperatorTester()
|
||||
.batchSize(1)
|
||||
.inputHeight(poolSize + 1)
|
||||
.inputWidth(3)
|
||||
.paddingTop(paddingTop)
|
||||
.paddingBottom(paddingBottom)
|
||||
.paddingHeight(paddingHeight)
|
||||
.poolingHeight(poolSize)
|
||||
.poolingWidth(1)
|
||||
.channels(channels)
|
||||
|
|
|
|||
|
|
@@ -183,12 +183,9 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1;
convolution->dilation_height = dilation_[kSpatialDim - 2];
convolution->dilation_width = dilation_[kSpatialDim - 1];
convolution->input_padding_top = padding_[kSpatialDim - 2];
convolution->input_padding_left = padding_[kSpatialDim - 1];
convolution->input_padding_bottom = padding_[kSpatialDim - 2];
convolution->input_padding_right = padding_[kSpatialDim - 1];
convolution->input_padding_front = kSpatialDim == 3 ? padding_[0] : 0;
convolution->input_padding_back = kSpatialDim == 3 ? padding_[0] : 0;
convolution->input_padding_height = padding_[kSpatialDim - 2];
convolution->input_padding_width = padding_[kSpatialDim - 1];
convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0;
convolution->per_channel = is_per_channel;
convolution->transpose = transpose_;
@@ -298,10 +298,8 @@ void check_maxpool2d_params(

const pytorch_qnnp_status createStatus =
pytorch_qnnp_create_max_pooling2d_nhwc_u8(
padH /* input_padding_top */,
padW /* input_padding_right */,
padH /* input_padding_bottom */,
padW /* input_padding_left */,
padH /* input_padding_height */,
padW /* input_padding_width */,
kH /* pooling height */,
kW /* pooling width */,
strideH /* stride height */,
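The call above forwards the symmetric padH/padW pair into QNNPACK's quantized max-pooling operator. A hedged end-to-end sketch of the Python path that can reach this code when the QNNPACK backend is used (shapes and values are illustrative):

  import torch

  x = torch.rand(1, 3, 8, 8)
  qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.quint8)
  # padding is the symmetric (padH, padW) pair handed to pytorch_qnnp_create_max_pooling2d_nhwc_u8
  qy = torch.nn.quantized.functional.max_pool2d(qx, kernel_size=2, stride=2, padding=1)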
@@ -270,7 +270,7 @@ Tensor _sparse_csr_addmm(
Tensor add_sparse_csr(const Tensor& self, const Tensor& other, const Scalar& alpha) {
auto commonDtype = at::result_type(self, other);
alpha_check(commonDtype, alpha);
Tensor result = at::empty({0}, self.options().dtype(commonDtype));
Tensor result = at::empty({0, 0}, self.options().dtype(commonDtype));
return at::add_out(result, self, other, alpha); // redispatch!
}
@@ -147,5 +147,30 @@ Tensor& addmv_out_sparse_csr_cuda(
return result;
}

/*
Solves a system of linear equations whose coefficients are represented in a sparse triangular matrix A:
op(A) X = B.

Args:
* `B` - dense Tensor of size m × nrhs.
* `A` - sparse Tensor of size m × m.
* `upper` - controls whether upper or lower triangular part of A is considered in computations.
* `transpose` - if true then op(A) = A^T.
* `unitriangular` - if true then the diagonal elements of A are assumed to be one.
* `X` - dense Tensor of size m × nrhs.
* `clone_A` - cloned matrix A, required only for compatibility with strided layout interface.
*/
std::tuple<Tensor&, Tensor&> triangular_solve_out_sparse_csr_cuda(
const Tensor& B,
const Tensor& A,
bool upper,
bool transpose,
bool unitriangular,
Tensor& X,
Tensor& clone_A) {
sparse::impl::cuda::triangular_solve_out_sparse_csr(A, B, X, upper, transpose, unitriangular);
return std::tuple<Tensor&, Tensor&>(X, clone_A);
}

} // namespace native
} // namespace at
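The comment block above documents the new sparse-CSR entry point for triangular_solve. A dense illustration of the op(A) X = B semantics it implements (plain torch calls, not the sparse CUDA kernel; the second return value corresponds to clone_A):

  import torch

  A = torch.tensor([[2., 0.],
                    [1., 3.]])   # lower-triangular coefficients
  B = torch.tensor([[4.],
                    [7.]])
  X, _ = torch.triangular_solve(B, A, upper=False)
  # A @ X == B, so X == [[2.0000], [1.6667]]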
@ -1,8 +1,10 @@
|
|||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/cuda/CUDADataType.h>
|
||||
#include <ATen/cuda/CUDASparse.h>
|
||||
#include <ATen/cuda/CUDASparseBlas.h>
|
||||
#include <ATen/cuda/CUDASparseDescriptors.h>
|
||||
#include <ATen/native/LinearAlgebraUtils.h>
|
||||
#include <ATen/native/cuda/MiscUtils.h>
|
||||
#include <ATen/native/sparse/cuda/SparseBlasImpl.h>
|
||||
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
|
||||
|
||||
|
|
@ -452,6 +454,326 @@ void addmv_out_sparse_csr(
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
Computes C = alpha * A + beta * B
|
||||
|
||||
Args:
|
||||
* `A` - [in] sparse Tensor of size m × n.
|
||||
* `B` - [in] sparse Tensor of size m × n.
|
||||
* `C` - [out] sparse Tensor of size m × n.
|
||||
*/
|
||||
void add_out_sparse_csr(
|
||||
const at::sparse_csr::SparseCsrTensor& A,
|
||||
const at::sparse_csr::SparseCsrTensor& B,
|
||||
const Scalar& alpha,
|
||||
const Scalar& beta,
|
||||
const at::sparse_csr::SparseCsrTensor& C) {
|
||||
IntArrayRef A_sizes = A.sizes();
|
||||
auto ndim = A.dim();
|
||||
int m = at::native::cuda_int_cast(A_sizes[ndim - 2], "m");
|
||||
int n = at::native::cuda_int_cast(A_sizes[ndim - 1], "n");
|
||||
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.sizes().equals(B.sizes()) && A.sizes().equals(C.sizes()));
|
||||
|
||||
// Only 32-bit indices are supported
|
||||
auto A_32 = at::native::_sparse_csr_tensor_unsafe(
|
||||
A.crow_indices().to(kInt),
|
||||
A.col_indices().to(kInt),
|
||||
A.values(),
|
||||
A.sizes(),
|
||||
A.scalar_type(),
|
||||
A.layout(),
|
||||
A.device());
|
||||
auto B_32 = at::native::_sparse_csr_tensor_unsafe(
|
||||
B.crow_indices().to(kInt),
|
||||
B.col_indices().to(kInt),
|
||||
B.values(),
|
||||
B.sizes(),
|
||||
B.scalar_type(),
|
||||
B.layout(),
|
||||
B.device());
|
||||
|
||||
// Modify C tensor in-place to swap indices tensors with 32-bit variants
|
||||
indices_to_32_bit_inplace(C);
|
||||
|
||||
int nnzA = at::native::cuda_int_cast(A_32._nnz(), "nnzA");
|
||||
int nnzB = at::native::cuda_int_cast(B_32._nnz(), "nnzB");
|
||||
|
||||
auto desc = at::cuda::sparse::CuSparseMatDescriptor();
|
||||
|
||||
auto A_crow_indices = A_32.crow_indices();
|
||||
auto B_crow_indices = B_32.crow_indices();
|
||||
auto C_crow_indices = C.crow_indices();
|
||||
auto A_crow_indices_ptr = A_crow_indices.data_ptr<int>();
|
||||
auto B_crow_indices_ptr = B_crow_indices.data_ptr<int>();
|
||||
auto C_crow_indices_ptr = C_crow_indices.data_ptr<int>();
|
||||
|
||||
auto A_col_indices = A_32.col_indices();
|
||||
auto B_col_indices = B_32.col_indices();
|
||||
auto C_col_indices = C.col_indices();
|
||||
auto A_col_indices_ptr = A_col_indices.data_ptr<int>();
|
||||
auto B_col_indices_ptr = B_col_indices.data_ptr<int>();
|
||||
auto C_col_indices_ptr = C_col_indices.data_ptr<int>();
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
C.scalar_type(), "add_out_sparse_csr_cuda_impl", [&] {
|
||||
auto beta_ = beta.to<scalar_t>();
|
||||
auto alpha_ = alpha.to<scalar_t>();
|
||||
|
||||
auto A_values = A_32.values();
|
||||
auto B_values = B_32.values();
|
||||
auto C_values = C.values();
|
||||
auto A_values_ptr = A_values.data_ptr<scalar_t>();
|
||||
auto B_values_ptr = B_values.data_ptr<scalar_t>();
|
||||
auto C_values_ptr = C_values.data_ptr<scalar_t>();
|
||||
|
||||
auto handle = at::cuda::getCurrentCUDASparseHandle();
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
|
||||
|
||||
size_t buffer_size;
|
||||
at::cuda::sparse::csrgeam2_bufferSizeExt<scalar_t>(
|
||||
handle,
|
||||
m,
|
||||
n,
|
||||
&alpha_,
|
||||
desc.descriptor(),
|
||||
nnzA,
|
||||
A_values_ptr,
|
||||
A_crow_indices_ptr,
|
||||
A_col_indices_ptr,
|
||||
&beta_,
|
||||
desc.descriptor(),
|
||||
nnzB,
|
||||
B_values_ptr,
|
||||
B_crow_indices_ptr,
|
||||
B_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
C_values_ptr,
|
||||
C_crow_indices_ptr,
|
||||
C_col_indices_ptr,
|
||||
&buffer_size // output
|
||||
);
|
||||
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto work_data = allocator.allocate(buffer_size);
|
||||
|
||||
int nnzC = -1;
|
||||
at::cuda::sparse::csrgeam2Nnz<scalar_t>(
|
||||
handle,
|
||||
m,
|
||||
n,
|
||||
desc.descriptor(),
|
||||
nnzA,
|
||||
A_crow_indices_ptr,
|
||||
A_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
nnzB,
|
||||
B_crow_indices_ptr,
|
||||
B_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
C_crow_indices_ptr,
|
||||
&nnzC,
|
||||
work_data.get());
|
||||
|
||||
// Resize result using nnz information from cusparse
|
||||
col_indices_and_values_resize_(C, nnzC);
|
||||
C_col_indices = C.col_indices();
|
||||
C_values = C.values();
|
||||
|
||||
C_col_indices_ptr = C_col_indices.data_ptr<int>();
|
||||
C_values_ptr = C_values.data_ptr<scalar_t>();
|
||||
|
||||
at::cuda::sparse::csrgeam2<scalar_t>(
|
||||
handle,
|
||||
m,
|
||||
n,
|
||||
&alpha_,
|
||||
desc.descriptor(),
|
||||
nnzA,
|
||||
A_values_ptr,
|
||||
A_crow_indices_ptr,
|
||||
A_col_indices_ptr,
|
||||
&beta_,
|
||||
desc.descriptor(),
|
||||
nnzB,
|
||||
B_values_ptr,
|
||||
B_crow_indices_ptr,
|
||||
B_col_indices_ptr,
|
||||
desc.descriptor(),
|
||||
C_values_ptr,
|
||||
C_crow_indices_ptr,
|
||||
C_col_indices_ptr,
|
||||
work_data.get());
|
||||
});
|
||||
}
|
||||
|
||||
/*
|
||||
Solves a system of linear equations whose coefficients are represented in a sparse triangular matrix A:
|
||||
op(A) X = B.
|
||||
|
||||
Args:
|
||||
* `A` - sparse Tensor of size m × m.
|
||||
* `B` - dense Tensor of size m × nrhs.
|
||||
* `X` - dense Tensor of size m × nrhs.
|
||||
* `upper` - controls whether upper or lower triangular part of A is considered in computations.
|
||||
* `transpose` - if true then op(A) = A^T.
|
||||
* `unitriangular` - if true then the diagonal elements of A are assumed to be one.
|
||||
*/
|
||||
void triangular_solve_out_sparse_csr(
|
||||
const at::sparse_csr::SparseCsrTensor& A,
|
||||
const Tensor& B,
|
||||
const Tensor& X,
|
||||
bool upper,
|
||||
bool transpose,
|
||||
bool unitriangular) {
|
||||
#if !AT_USE_CUSPARSE_GENERIC_SPSV()
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling triangular solve on a sparse GPU tensor requires compiling ",
|
||||
"PyTorch with at least CUDA 11.3. ",
|
||||
"Please use PyTorch built with newer CUDA version.");
|
||||
#else
|
||||
if (B.numel() == 0 || X.numel() == 0 || A._nnz() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
c10::MaybeOwned<Tensor> B_ = prepare_dense_matrix_for_cusparse(B);
|
||||
c10::MaybeOwned<Tensor> X_ = prepare_dense_matrix_for_cusparse(X);
|
||||
|
||||
// TODO: update this to support COO sparse layout
|
||||
auto descA = at::cuda::sparse::CuSparseSpMatCsrDescriptor(A);
|
||||
descA.set_mat_fill_mode(upper);
|
||||
descA.set_mat_diag_type(unitriangular);
|
||||
cusparseOperation_t opA = transpose ? CUSPARSE_OPERATION_TRANSPOSE
|
||||
: CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
|
||||
if (B.size(-1) == 1) {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] {
|
||||
scalar_t alpha = 1;
|
||||
cudaDataType compute_type = at::cuda::getCudaDataType<scalar_t>();
|
||||
auto handle = at::cuda::getCurrentCUDASparseHandle();
|
||||
size_t buffer_size;
|
||||
|
||||
auto desc_spsv = at::cuda::sparse::CuSparseSpSVDescriptor();
|
||||
auto descB = at::cuda::sparse::CuSparseDnVecDescriptor(*B_);
|
||||
auto descX = at::cuda::sparse::CuSparseDnVecDescriptor(*X_);
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSV_bufferSize(
|
||||
handle,
|
||||
opA,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSV_ALG_DEFAULT,
|
||||
desc_spsv.descriptor(),
|
||||
&buffer_size // output
|
||||
));
|
||||
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto work_data = allocator.allocate(buffer_size);
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSV_analysis(
|
||||
handle,
|
||||
opA,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSV_ALG_DEFAULT,
|
||||
desc_spsv.descriptor(),
|
||||
work_data.get()));
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSV_solve(
|
||||
handle,
|
||||
opA,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSV_ALG_DEFAULT,
|
||||
desc_spsv.descriptor()));
|
||||
});
|
||||
} else {
|
||||
#if !AT_USE_CUSPARSE_GENERIC_SPSM()
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling triangular solve on a sparse GPU tensor requires compiling ",
|
||||
"PyTorch with at least CUDA 11.3.1. ",
|
||||
"Please use PyTorch built with newer CUDA version.");
|
||||
#else
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
X.scalar_type(), "triangular_solve_out_sparse_csr_cuda_impl", [&] {
|
||||
scalar_t alpha = 1;
|
||||
cudaDataType compute_type = at::cuda::getCudaDataType<scalar_t>();
|
||||
auto handle = at::cuda::getCurrentCUDASparseHandle();
|
||||
size_t buffer_size;
|
||||
|
||||
// TODO: support mixed memory format
|
||||
IntArrayRef X_strides = X_->strides();
|
||||
IntArrayRef B_strides = B_->strides();
|
||||
auto ndim = X_->dim();
|
||||
bool is_X_row_major = (X_strides[ndim - 1] == 1);
|
||||
bool is_B_row_major = (B_strides[ndim - 1] == 1);
|
||||
TORCH_INTERNAL_ASSERT(is_X_row_major && is_B_row_major);
|
||||
|
||||
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
auto desc_spsm = at::cuda::sparse::CuSparseSpSMDescriptor();
|
||||
auto descB = at::cuda::sparse::CuSparseDnMatDescriptor(*B_);
|
||||
auto descX = at::cuda::sparse::CuSparseDnMatDescriptor(*X_);
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSM_bufferSize(
|
||||
handle,
|
||||
opA,
|
||||
opB,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSM_ALG_DEFAULT,
|
||||
desc_spsm.descriptor(),
|
||||
&buffer_size // output
|
||||
));
|
||||
|
||||
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
|
||||
auto work_data = allocator.allocate(buffer_size);
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSM_analysis(
|
||||
handle,
|
||||
opA,
|
||||
opB,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSM_ALG_DEFAULT,
|
||||
desc_spsm.descriptor(),
|
||||
work_data.get()));
|
||||
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSpSM_solve(
|
||||
handle,
|
||||
opA,
|
||||
opB,
|
||||
&alpha,
|
||||
descA.descriptor(),
|
||||
descB.descriptor(),
|
||||
descX.descriptor(),
|
||||
compute_type,
|
||||
CUSPARSE_SPSM_ALG_DEFAULT,
|
||||
desc_spsm.descriptor()));
|
||||
});
|
||||
#endif // !AT_USE_CUSPARSE_GENERIC_SPSM()
|
||||
}
|
||||
if (!X.is_same(*X_)) {
|
||||
X.copy_(*X_);
|
||||
}
|
||||
#endif // !AT_USE_CUSPARSE_GENERIC_SPSV()
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace impl
|
||||
} // namespace sparse
|
||||
|
|
|
|||
|
|
@@ -24,6 +24,21 @@ void addmv_out_sparse_csr(
const Scalar& alpha,
const Tensor& result);

void add_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& mat1,
const at::sparse_csr::SparseCsrTensor& mat2,
const Scalar& alpha,
const Scalar& beta,
const at::sparse_csr::SparseCsrTensor& result);

void triangular_solve_out_sparse_csr(
const at::sparse_csr::SparseCsrTensor& A,
const Tensor& B,
const Tensor& X,
bool upper,
bool transpose,
bool unitriangular);

} // namespace cuda
} // namespace impl
} // namespace sparse
@@ -19,6 +19,7 @@
#include <ATen/cuda/ThrustAllocator.h>
#include <c10/cuda/CUDACachingAllocator.h>

#include <ATen/native/sparse/cuda/SparseBlasImpl.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <ATen/native/sparse/cuda/SparseCUDATensorMath.cuh>

@@ -187,11 +188,16 @@ Tensor& add_out_sparse_csr_cuda(
const Scalar& alpha,
SparseCsrTensor& out) {
if (self.layout() == kStrided) {
return add_out_dense_sparse_csr_cuda(out, self, other, alpha);
add_out_dense_sparse_csr_cuda(out, self, other, alpha);
} else {
TORCH_CHECK(
false,
"NotImplementedError: Addition of sparse CSR tensors is not yet implemented.")
self.sizes().equals(other.sizes()),
"torch.add: Expected input tensors to have the same shape, but got tensor `self` with shape ",
self.sizes(),
" and tensor `other` with shape ",
other.sizes());
at::native::resize_as_sparse_csr_(out, self);
sparse::impl::cuda::add_out_sparse_csr(self, other, Scalar(1), alpha, out);
}
return out;
}
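Note how the scalars are threaded through: torch.add(self, other, alpha=a) computes self + a * other, so the kernel above is invoked with alpha = Scalar(1) for self and beta = alpha for other in its C = alpha * A + beta * B convention. A hedged usage sketch of the path this enables (assumes a CUDA build with cuSPARSE, and that CSR tensors can be moved to the GPU in that build):

  import torch

  a = torch.tensor([[0., 1.], [2., 0.]]).to_sparse_csr().cuda()
  b = torch.tensor([[3., 0.], [0., 4.]]).to_sparse_csr().cuda()
  c = torch.add(a, b, alpha=2.0)  # dispatches to add_out_sparse_csr_cuda above
  # c.to_dense() == [[6., 1.], [2., 8.]]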
@@ -145,6 +145,23 @@ void Context::flush() {
command().pool.purge();
}

void Context::wait(const at::Tensor& src) {
// wait only if Vulkan tensor
if (at::kVulkan == src.device().type()) {
api::Command::Pool& command_pool = command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();

using Future = ops::vTensor::Future<const void, ops::vTensor::Access::Read>;
const ops::vTensor& v_src = ops::convert(src);
const Future v_src_future = v_src.host<const void>(command_buffer);

// This wait() is a no-op if data is not out of sync. More often than
// not though, waits here are expected as the GPU catches up with
// compute submitted from CPU.
v_src_future.wait();
}
}

bool available() {
return context();
}
@@ -56,6 +56,10 @@ class Context final {

void flush();

// Use this function only for debugging and testing when you want to make sure
// all GPU operations get finished before calling flush(). Otherwise, it may crash.
void wait(const at::Tensor& src);

 private:
VkDevice device();
VkQueue queue();
@@ -12,17 +12,23 @@ void copy_texture_to_texture(
api::Command::Buffer& command_buffer,
api::Resource::Image::Object& src_image,
api::Resource::Image::Object& dst_image,
api::utils::uvec3 src_extents,
api::utils::uvec3 copy_extents,
api::utils::uvec3 src_offset,
api::utils::uvec3 dst_offset) {
VkImageCopy copy_info{};
copy_info.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copy_info.srcSubresource.layerCount = 1;
copy_info.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copy_info.dstSubresource.layerCount = 1;
copy_info.extent.width = src_extents.data[0u];
copy_info.extent.height = src_extents.data[1u];
copy_info.extent.depth = src_extents.data[2u];
copy_info.extent.width = copy_extents.data[0u];
copy_info.extent.height = copy_extents.data[1u];
copy_info.extent.depth = copy_extents.data[2u];
copy_info.srcOffset.x = src_offset.data[0u];
copy_info.srcOffset.y = src_offset.data[1u];
copy_info.srcOffset.z = src_offset.data[2u];
copy_info.dstOffset.x = dst_offset.data[0u];
copy_info.dstOffset.y = dst_offset.data[1u];
copy_info.dstOffset.z = dst_offset.data[2u];

// To use vkCmdCopyImage, the stage of src & dst image must be set to vTensor::Stage::Transfer.
vkCmdCopyImage(
@@ -18,7 +18,8 @@ void copy_texture_to_texture(
api::Command::Buffer& command_buffer,
api::Resource::Image::Object& src_image,
api::Resource::Image::Object& dst_image,
api::utils::uvec3 src_extents,
api::utils::uvec3 copy_extents,
api::utils::uvec3 src_offset,
api::utils::uvec3 dst_offset);

} // namespace utils
@ -96,6 +96,61 @@ Tensor cat_feature(const TensorList tensors, vTensor& v_output) {
|
|||
return convert(v_output);
|
||||
}
|
||||
|
||||
Tensor cat_feature_mult4ch(const TensorList tensors, vTensor& v_output) {
|
||||
api::Context* const context = api::context();
|
||||
api::Command::Pool& command_pool = context->command().pool;
|
||||
api::Command::Buffer& command_buffer = command_pool.stream();
|
||||
|
||||
int64_t depth_size_allprior = 0;
|
||||
int64_t ch_interval = 0;
|
||||
for (const auto& tensor : tensors) {
|
||||
ch_interval += tensor.sizes()[1];
|
||||
}
|
||||
const int64_t depth_interval = ch_interval / 4;
|
||||
|
||||
auto dst_image = v_output.image(
|
||||
command_buffer,
|
||||
vTensor::Stage::Transfer,
|
||||
vTensor::Access::Write);
|
||||
uvec3 src_offset{};
|
||||
uvec3 dst_offset{};
|
||||
|
||||
for (const auto& tensor : tensors) {
|
||||
const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan();
|
||||
const vTensor& v_self = convert(self);
|
||||
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
|
||||
auto src_image = v_self.image(
|
||||
command_buffer,
|
||||
vTensor::Stage::Transfer);
|
||||
|
||||
const uint32_t depth_slice = safe_downcast<uint32_t>(tensor.sizes()[1] / 4);
|
||||
uvec3 copy_extents {v_self.extents().data[0u],
|
||||
v_self.extents().data[1u],
|
||||
depth_slice};
|
||||
|
||||
for (int b = 0; b < tensor.sizes()[0]; ++b) {
|
||||
src_offset.data[2u] = safe_downcast<uint32_t>(depth_slice * b);
|
||||
dst_offset.data[2u] = depth_size_allprior + safe_downcast<uint32_t>(depth_interval * b);
|
||||
api::helper::copy_texture_to_texture(command_buffer,
|
||||
src_image,
|
||||
dst_image,
|
||||
copy_extents,
|
||||
src_offset,
|
||||
dst_offset);
|
||||
}
|
||||
|
||||
depth_size_allprior += depth_slice;
|
||||
}
|
||||
else {
|
||||
TORCH_CHECK(false, "Not implemented!");
|
||||
}
|
||||
}
|
||||
|
||||
command_pool.submit(context->gpu().queue, command_buffer);
|
||||
|
||||
return convert(v_output);
|
||||
}
|
||||
|
||||
Tensor cat_width(const TensorList tensors, vTensor& v_output) {
|
||||
TORCH_CHECK(false, "Vulkan cat not implemented for width dimension!");
|
||||
}
|
||||
|
|
@ -110,6 +165,7 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
|
|||
vTensor::Stage::Transfer,
|
||||
vTensor::Access::Write);
|
||||
|
||||
uvec3 src_offset{};
|
||||
uvec3 dst_offset{};
|
||||
for (const auto& tensor : tensors) {
|
||||
const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan();
|
||||
|
|
@ -123,6 +179,7 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) {
|
|||
src_image,
|
||||
dst_image,
|
||||
v_self.extents(),
|
||||
src_offset,
|
||||
dst_offset);
|
||||
|
||||
// Increment by height
|
||||
|
|
@ -148,11 +205,16 @@ Tensor cat(
|
|||
|
||||
at::Tensor tensor = tensors[0];
|
||||
int64_t cat_dim_size = 0;
|
||||
bool is_mult4ch = true;
|
||||
|
||||
for (const auto & t : tensors) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
t.dim() == 4, "Vulkan cat expects 4 dimensional inputs");
|
||||
|
||||
if (t.sizes()[1] % 4 != 0) {
|
||||
is_mult4ch = false;
|
||||
}
|
||||
|
||||
for (int d = 0; d < 4; ++d) {
|
||||
if (d == dim) {
|
||||
continue;
|
||||
|
|
@@ -179,6 +241,9 @@ Tensor cat(
return cat_height(tensors, v_output);
}
else if (dim == 1) {
if (is_mult4ch) {
return cat_feature_mult4ch(tensors, v_output);
}
return cat_feature(tensors, v_output);
}
return cat_batch(tensors, v_output);
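The dispatch above routes channel-dimension concatenation to the new cat_feature_mult4ch path whenever every input's channel count is a multiple of 4 (the is_mult4ch check earlier in this file's hunk). A hedged usage sketch (requires a Vulkan-enabled build; shapes are illustrative):

  import torch

  a = torch.rand(1, 4, 8, 8)
  b = torch.rand(1, 8, 8, 8)
  # both channel counts are multiples of 4, so the mult4ch fast path is taken
  out = torch.cat([a.vulkan(), b.vulkan()], dim=1).cpu()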