Update on "Add type refinement for Union"

Differential Revision: [D28572811](https://our.internmc.facebook.com/intern/diff/D28572811)

[ghstack-poisoned]
Ansley Ussery 2021-08-30 14:29:59 -07:00
commit b3ece930a1
284 changed files with 11376 additions and 5248 deletions

View File

@ -3,7 +3,11 @@ build --copt=-I.
build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
# Configuration to disable tty features for environments like CI
build:no-tty --curses no
build:no-tty --progress_report_interval 10
build:no-tty --show_progress_rate_limit 10
# Configuration to build with GPU support
build:gpu --define=cuda=true
# define a separate build folder for faster switching between configs
build:gpu --platform_suffix=-gpu

View File

@ -14,6 +14,9 @@ chmod +x "$build_script"
# Build
cat >"$build_script" <<EOL
export PATH="$workdir/miniconda/bin:$PATH"
if [[ "$CIRCLE_BRANCH" == "nightly" ]]; then
export USE_PYTORCH_METAL_EXPORT=1
fi
if [[ "$PACKAGE_TYPE" == conda ]]; then
"$workdir/builder/conda/build_pytorch.sh"
else

View File

@ -10,18 +10,27 @@ pt_checkout="/var/lib/jenkins/workspace"
# Since we're cat-ing this file, we need to escape all $'s
echo "cpp_doc_push_script.sh: Invoked with $*"
# Argument 1: Where to copy the built documentation for Python API to
# (pytorch.github.io/$install_path)
install_path="$1"
if [ -z "$install_path" ]; then
echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified"
# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
# the order of operations goes:
# 1. Check if there's an argument $1
# 2. If no argument check for environment var DOCS_INSTALL_PATH
# 3. If no environment var fall back to default 'docs/'
# NOTE: It might seem odd to gather the second argument before the first, but
# since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's better to
# gather the version first so we don't break people who rely on this script
# Argument 2: What version of the Python API docs we are building.
version="${2:-${DOCS_VERSION:-master}}"
if [ -z "$version" ]; then
echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
exit 1
fi
# Argument 2: What version of the Python API docs we are building.
version="$2"
if [ -z "$version" ]; then
echo "error: cpp_doc_push_script.sh: version (arg2) not specified"
# Argument 1: Where to copy the built documentation for Python API to
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi
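The nested expansion ${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}} is just a three-level fallback. A minimal C++ analog of that precedence, purely for illustration (the function name and signature are made up; the script itself stays in bash):

#include <cstdlib>
#include <string>

// Hypothetical analog of install_path="${1:-${DOCS_INSTALL_PATH:-docs/...}}":
// prefer the positional argument, then the environment variable, then a default.
std::string resolve_install_path(int argc, char** argv) {
  if (argc > 1) {
    return argv[1];                                    // 1. explicit argument
  }
  if (const char* env = std::getenv("DOCS_INSTALL_PATH")) {
    return env;                                        // 2. environment variable
  }
  return "docs/";                                      // 3. built-in default
}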

View File

@ -13,18 +13,27 @@ echo "python_doc_push_script.sh: Invoked with $*"
set -ex
# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="$1"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
# the order of operations goes:
# 1. Check if there's an argument $1
# 2. If no argument check for environment var DOCS_INSTALL_PATH
# 3. If no environment var fall back to default 'docs/'
# NOTE: It might seem odd to gather the second argument before the first, but
# since DOCS_INSTALL_PATH can be derived from DOCS_VERSION it's better to
# gather the version first so we don't break people who rely on this script
# Argument 2: What version of the docs we are building.
version="${2:-${DOCS_VERSION:-master}}"
if [ -z "$version" ]; then
echo "error: python_doc_push_script.sh: version (arg2) not specified"
exit 1
fi
# Argument 2: What version of the docs we are building.
version="$2"
if [ -z "$version" ]; then
echo "error: python_doc_push_script.sh: version (arg2) not specified"
# Argument 1: Where to copy the built documentation to
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
exit 1
fi
@ -34,7 +43,7 @@ if [ "$version" == "master" ]; then
fi
# Argument 3: The branch to push to. Usually is "site"
branch="$3"
branch="${3:-${DOCS_BRANCH:-site}}"
if [ -z "$branch" ]; then
echo "error: python_doc_push_script.sh: branch (arg3) not specified"
exit 1

View File

@ -148,6 +148,8 @@ class CIWorkflow:
enable_nogpu_no_avx_test: YamlShellBool = "''"
enable_nogpu_no_avx2_test: YamlShellBool = "''"
enable_slow_test: YamlShellBool = "''"
enable_docs_test: YamlShellBool = "''"
enable_backwards_compat_test: YamlShellBool = "''"
def __post_init__(self) -> None:
if self.is_libtorch:
@ -266,6 +268,8 @@ LINUX_WORKFLOWS = [
test_runner_type=LINUX_CPU_TEST_RUNNER,
on_pull_request=True,
enable_doc_jobs=True,
enable_docs_test=1,
enable_backwards_compat_test=1,
num_test_shards=2,
ciflow_config=CIFlowConfig(
enabled=True,
@ -497,7 +501,7 @@ BAZEL_WORKFLOWS = [
CIWorkflow(
arch="linux",
build_environment="linux-xenial-py3.6-gcc7-bazel-test",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
test_runner_type=LINUX_CPU_TEST_RUNNER,
on_pull_request=True,
ciflow_config=CIFlowConfig(

View File

@ -55,6 +55,10 @@ def main() -> None:
configs['distributed'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_SLOW_TEST'):
configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_DOCS_TEST'):
configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'):
configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE}
matrix = {
'include': [
{

View File

@ -254,6 +254,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }}
ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }}
ENABLE_SLOW_TEST: !{{ enable_slow_test }}
ENABLE_DOCS_TEST: !{{ enable_docs_test }}
ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }}
NUM_TEST_SHARDS: !{{ num_test_shards }}
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -346,7 +348,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: !{{ build_environment }}-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
@ -441,11 +442,15 @@ jobs:
{% endblock %}
{%- endif -%}
{%- if enable_doc_jobs %}
pytorch_python_doc_build:
pytorch_doc_build:
runs-on: linux.2xlarge
strategy:
matrix:
docs_type: [cpp, python]
needs: [calculate-docker-image, build, !{{ ciflow_config.root_job_name }}]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
DOCS_TYPE: ${{ matrix.docs_type }}
steps:
- name: Log in to ECR
run: |
@ -483,7 +488,7 @@ jobs:
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Build Python Doc in Docker
- name: Build ${{ matrix.docs_type }} docs
run: |
set -ex
time docker pull "${DOCKER_IMAGE}" > /dev/null
@ -496,6 +501,8 @@ jobs:
-e IN_CI \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e CIRCLE_SHA1="$GITHUB_SHA" \
-e DOCS_VERSION="${target}" \
-e DOCS_TYPE \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -505,34 +512,36 @@ jobs:
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
if: ${{ github.event_name == 'pull_request' }}
name: Upload Docs Preview
- uses: seemethere/upload-artifact-s3@v3
name: Upload Python Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
with:
name: deploy
retention-days: 14
if-no-files-found: error
path: pytorch.github.io/docs/merge
- name: Show Docs Preview URL (Click Me)
if: ${{ github.event_name == 'pull_request' }}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
path: pytorch.github.io/docs/merge/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
- uses: seemethere/upload-artifact-s3@v3
name: Upload C++ Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
with:
retention-days: 14
if-no-files-found: error
path: cppdocs/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
- name: Archive artifacts into zip
run: |
zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: pytorch_github_io
name: docs_${{ matrix.docs_type }}
path: docs_${{ matrix.docs_type }}.zip
if-no-files-found: error
path: pytorch_github_io.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-bionic-py3.8-gcc9-coverage-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: 1
ENABLE_NOGPU_NO_AVX2_TEST: 1
ENABLE_SLOW_TEST: 1
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-cuda10.2-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -230,6 +230,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: 1
ENABLE_BACKWARDS_COMPAT_TEST: 1
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -325,7 +327,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc5.4-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then
@ -429,11 +430,15 @@ jobs:
# Prune all of the docker images
docker system prune -af
pytorch_python_doc_build:
pytorch_doc_build:
runs-on: linux.2xlarge
strategy:
matrix:
docs_type: [cpp, python]
needs: [calculate-docker-image, build, ciflow_should_run]
env:
DOCKER_IMAGE: ${{ needs.calculate-docker-image.outputs.docker_image }}
DOCS_TYPE: ${{ matrix.docs_type }}
steps:
- name: Log in to ECR
run: |
@ -474,7 +479,7 @@ jobs:
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Build Python Doc in Docker
- name: Build ${{ matrix.docs_type }} docs
run: |
set -ex
time docker pull "${DOCKER_IMAGE}" > /dev/null
@ -487,6 +492,8 @@ jobs:
-e IN_CI \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e CIRCLE_SHA1="$GITHUB_SHA" \
-e DOCS_VERSION="${target}" \
-e DOCS_TYPE \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -496,34 +503,36 @@ jobs:
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/python_doc_push_script.sh docs/$target $target site"
bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh"
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- uses: driazati/upload-artifact-s3@21c31d0a7bcb056ca50bd6ce197ba6507c26a1be
if: ${{ github.event_name == 'pull_request' }}
name: Upload Docs Preview
- uses: seemethere/upload-artifact-s3@v3
name: Upload Python Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }}
with:
name: deploy
retention-days: 14
if-no-files-found: error
path: pytorch.github.io/docs/merge
- name: Show Docs Preview URL (Click Me)
if: ${{ github.event_name == 'pull_request' }}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "See rendered docs at https://docs-preview.pytorch.org/$PR_NUMBER/"
path: pytorch.github.io/docs/merge/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}
- uses: seemethere/upload-artifact-s3@v3
name: Upload C++ Docs Preview
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cppdocs' }}
with:
retention-days: 14
if-no-files-found: error
path: cppdocs/
s3-prefix: ${{ github.repository }}/pr-previews/pr/${{ github.event.pull_request.number }}/cppdocs
- name: Archive artifacts into zip
run: |
zip -r pytorch_github_io.zip "${GITHUB_WORKSPACE}/pytorch.github.io"
zip -r "docs_${DOCS_TYPE}.zip" "${GITHUB_WORKSPACE}/pytorch.github.io" "${GITHUB_WORKSPACE}/cppdocs"
- uses: actions/upload-artifact@v2
name: Store PyTorch Build Artifacts
with:
name: pytorch_github_io
name: docs_${{ matrix.docs_type }}
path: docs_${{ matrix.docs_type }}.zip
if-no-files-found: error
path: pytorch_github_io.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()

View File

@ -15,7 +15,7 @@ on:
env:
BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7-bazel-test
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1

View File

@ -228,6 +228,8 @@ jobs:
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
@ -323,7 +325,6 @@ jobs:
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
env:
BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.6-gcc7-${{ matrix.config }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
if [[ $TEST_CONFIG == 'multigpu' ]]; then

View File

@ -224,7 +224,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
get_bazel
# first build the whole torch for CPU-only
tools/bazel build --config=no-tty :torch
# then build a selected set of targets with GPU support.
# TODO: eventually this should converge to building the whole :torch with GPU support
tools/bazel build --config=no-tty --config=gpu :c10
else
# check that setup.py would fail with bad arguments
echo "The next three invocations are expected to fail with invalid command error messages."

View File

@ -494,6 +494,10 @@ test_torch_deploy() {
assert_git_not_dirty
}
test_docs_test() {
.jenkins/pytorch/docs-test.sh
}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@ -517,7 +521,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || "$
test_without_numpy
install_torchvision
test_python_shard1
test_distributed
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || "${SHARD_NUMBER}" == 2 ]]; then
install_torchvision
@ -533,6 +536,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
elif [[ "${BUILD_ENVIRONMENT}" == *distributed* ]]; then
test_distributed
test_rpc
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
else
install_torchvision
install_monkeytype

View File

@ -214,6 +214,7 @@ option(USE_LMDB "Use LMDB" OFF)
option(USE_MAGMA "Use MAGMA" ON)
option(USE_METAL "Use Metal for Caffe2 iOS build" ON)
option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(
USE_MLCOMPUTE "Use ML Compute for macOS build" ON
@ -688,6 +689,10 @@ if(USE_PYTORCH_METAL)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL")
endif()
if(USE_PYTORCH_METAL_EXPORT)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_METAL_EXPORT")
endif()
if(USE_SOURCE_DEBUG_ON_MOBILE)
string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
endif()

View File

@ -1,7 +1,7 @@
workspace(name = "pytorch")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("//tools/rules:workspace.bzl", "new_patched_local_repository")
load("//tools/rules:workspace.bzl", "new_patched_local_repository", "new_empty_repository")
http_archive(
name = "bazel_skylib",
@ -170,3 +170,14 @@ protobuf_deps()
load("@rules_python//python:repositories.bzl", "py_repositories")
py_repositories()
local_repository(
name = "local_config_cuda",
path = "third_party/tensorflow_cuda_bazel_build",
)
# Wrapper to expose local_config_cuda in an agnostic way
new_empty_repository(
name = "cuda",
build_file = "//third_party:cuda.BUILD",
)

View File

@ -167,13 +167,12 @@ else()
endif()
# Metal
if(USE_PYTORCH_METAL)
if(APPLE)
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
else()
# Add files needed from optimized_for_mobile
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
endif()
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed for exporting Metal models (optimized_for_mobile)
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${metal_prepack_cpp})
elseif(APPLE AND USE_PYTORCH_METAL)
# Compile Metal kernels
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp} ${native_metal_srcs})
else()
set(all_cpu_cpp ${all_cpu_cpp} ${metal_cpp})
endif()
@ -450,13 +449,21 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
if(NOT INTERN_BUILD_MOBILE)
list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h})
# Metal
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed for exporting Metal models (optimized_for_mobile)
list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
elseif(APPLE AND USE_PYTORCH_METAL)
# Needed by Metal kernels
list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
else()
list(APPEND INSTALL_HEADERS ${metal_h})
endif()
else()
if(USE_PYTORCH_METAL)
if(IOS)
if(IOS AND USE_PYTORCH_METAL)
list(APPEND INSTALL_HEADERS ${metal_h} ${native_metal_h})
else()
else()
list(APPEND INSTALL_HEADERS ${metal_h} ${metal_prepack_h})
endif()
endif()
endif()

View File

@ -10,7 +10,7 @@ namespace at {
struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
// Constructors
CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
~CPUGeneratorImpl() = default;
~CPUGeneratorImpl() override = default;
// CPUGeneratorImpl methods
std::shared_ptr<CPUGeneratorImpl> clone() const;

View File

@ -7,6 +7,9 @@
namespace at {
static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wrap_scalar=true) {
// if dim_post_expr is 0 and wrap_scalar is true, then dim must be in the range [-1, 0].
// This is a special case for scalar tensors and manifests in e.g. torch.sum(scalar_tensor, 0)
// Otherwise, dim should be in the range [-dim_post_expr, dim_post_expr-1].
return c10::maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
}
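Worked values for the contract described in the comment above, assuming the header is included as <ATen/WrapDimUtils.h> (the helper function below is made up for illustration):

#include <ATen/WrapDimUtils.h>
#include <cstdint>

int64_t wrap_dim_examples() {
  // Negative dims index from the end: for a 4-d tensor, -1 wraps to 3.
  int64_t a = at::maybe_wrap_dim(-1, /*dim_post_expr=*/4);
  // Scalar special case: dim_post_expr == 0 with wrap_scalar == true accepts
  // dims in [-1, 0], and both wrap to 0.
  int64_t b = at::maybe_wrap_dim(-1, /*dim_post_expr=*/0, /*wrap_scalar=*/true);
  return a + b;  // 3 + 0
}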

View File

@ -123,7 +123,7 @@ struct BuiltinOpFunction : public Function {
return *this;
}
~BuiltinOpFunction() {}
~BuiltinOpFunction() override {}
private:
c10::QualifiedName name_;

View File

@ -344,6 +344,10 @@ public:
c10::Dispatcher::singleton().callBoxed(*this, stack);
}
void callBoxed(Stack& stack) const {
callBoxed(&stack);
}
void redispatchBoxed(DispatchKeySet ks, Stack* stack) const {
c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack);
}

View File

@ -34,6 +34,9 @@ struct Argument {
default_value_(std::move(default_value)),
kwarg_only_(kwarg_only),
alias_info_(std::move(alias_info)) {
// This is a softly-enforced invariant for out arguments.
bool is_alias = alias_info_.has_value() && alias_info_.value().isWrite();
is_out_ = kwarg_only_ && is_alias;
}
const std::string& name() const {
return name_;
@ -50,6 +53,11 @@ struct Argument {
bool kwarg_only() const {
return kwarg_only_;
}
bool is_out() const {
return is_out_;
}
const c10::optional<AliasInfo>& alias_info() const {
return alias_info_;
}
@ -116,6 +124,8 @@ struct Argument {
// is this only specifiable as a keyword argument?
bool kwarg_only_;
c10::optional<AliasInfo> alias_info_;
// marks whether the argument is an out argument of the schema
bool is_out_;
};
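In other words, an argument counts as an out argument exactly when it is kwarg-only and its alias info marks a write (e.g. a `Tensor(a!) out` parameter). A small sketch using the new accessor (the helper below is hypothetical, not part of the patch):

#include <ATen/core/function_schema.h>

// Count how many arguments of a schema are classified as out arguments.
size_t count_out_args(const c10::FunctionSchema& schema) {
  size_t n = 0;
  for (const auto& arg : schema.arguments()) {
    if (arg.is_out()) {  // kwarg-only and aliasing write => out argument
      ++n;
    }
  }
  return n;
}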
inline bool operator==(const Argument& lhs, const Argument& rhs) {

View File

@ -51,6 +51,16 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
return out;
}
inline size_t findFirstOutArg(const std::vector<Argument>& args) {
// find the start of out args in the schema
for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) {
if (args.at(out_start_idx).is_out()) {
return out_start_idx;
}
}
return args.size();
}
inline bool Argument::isBackwardCompatibleWith(
const Argument& old,
std::ostream* why_not) const {
@ -121,17 +131,20 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}
}
// Make sure that all the old arguments have their corresponding backward
// compatible arguments in this schema.
for (size_t i = 0; i < old.arguments().size(); ++i) {
// we want to test the out args and the default args separately
size_t old_out_start_idx = findFirstOutArg(old.arguments());
size_t new_out_start_idx = findFirstOutArg(arguments());
// make sure the arguments before the out args are backward compatible
for (size_t i = 0; i < old_out_start_idx; i++) {
if (!arguments().at(i).isBackwardCompatibleWith(
old.arguments().at(i), why_not)) {
return false;
}
}
// Validate that all new arguments provided a default value.
for (size_t i = old.arguments().size(); i < arguments().size(); ++i) {
// Validate that all new arguments provide a default value
for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
if (!arguments().at(i).default_value()) {
if (why_not) {
*why_not
@ -144,6 +157,15 @@ inline bool FunctionSchema::isBackwardCompatibleWith(
}
}
// now compare the out args
for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
if (!arguments()
.at(i - old_out_start_idx + new_out_start_idx)
.isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
return false;
}
}
return true;
}
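A sketch of what the split buys us, assuming the JIT schema parser torch::jit::parseSchema is available (the schemas below are made up for illustration): adding a defaulted non-out argument in front of the out arguments is now treated as backward compatible, because the default args and the out args are compared as separate groups.

#include <ATen/core/function_schema.h>
#include <torch/csrc/jit/frontend/function_schema_parser.h>

bool example_bc_check() {
  auto old_schema = torch::jit::parseSchema(
      "foo(Tensor self, *, Tensor(a!) out) -> Tensor(a!)");
  auto new_schema = torch::jit::parseSchema(
      "foo(Tensor self, int k=1, *, Tensor(a!) out) -> Tensor(a!)");
  // k is a new non-out argument with a default, and the out args still line
  // up, so with the new grouping this is expected to report compatibility.
  return new_schema.isBackwardCompatibleWith(old_schema);
}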

View File

@ -1,6 +1,9 @@
#pragma once
#include <type_traits>
#include <ATen/core/ivalue.h>
#include <c10/util/Deprecated.h>
// TODO move this to c10 namespace
@ -9,7 +12,42 @@ namespace jit {
using c10::IValue;
using Stack = std::vector<IValue>;
using Operation = std::function<void(Stack*)>;
class Operation {
template <typename F, typename Arg>
using accepts = std::is_constructible<std::function<void(Arg)>, F&&>;
public:
template <typename F,
std::enable_if_t<accepts<F, Stack*>::value, int> = 0>
C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.")
Operation(F&& raw): op_([raw = std::forward<F>(raw)](Stack& stack) {
raw(&stack);
}) {}
template <typename F,
std::enable_if_t<accepts<F, Stack&>::value &&
!std::is_same<std::decay_t<F>, Operation>::value, int> = 0>
Operation(F&& op): op_(std::forward<F>(op)) {}
Operation(std::nullptr_t) noexcept {}
explicit operator bool() const noexcept {
return op_ ? true : false;
}
void operator()(Stack& stack) {
op_(stack);
}
template <typename T>
T* target() noexcept {
return op_.target<T>();
}
private:
std::function<void(Stack&)> op_;
};
// An operation with N inputs and M outputs pops the last N inputs off
// the stack and pushes its M inputs onto the stack

View File

@ -459,9 +459,10 @@ TORCH_IMPL_FUNC(nll_loss_backward_out_cpu)
Tensor cross_entropy_loss_prob_target(
const Tensor& self,
const Tensor& target,
const Tensor& target_,
const Tensor& weight,
int64_t reduction) {
int64_t reduction,
double label_smoothing) {
const auto n_classes = self.size(1);
TORCH_CHECK(
!weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes),
@ -472,6 +473,15 @@ Tensor cross_entropy_loss_prob_target(
weight.sizes());
auto input = at::log_softmax(self, 1, self.scalar_type());
Tensor target;
if (label_smoothing > 0.0) {
TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);
target = target_ * (1 - label_smoothing) + label_smoothing / n_classes;
} else {
target = target_;
}
if (weight.defined()) {
// Expand weight to the correct number of dims for broadcasting with input / target
auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());
@ -503,12 +513,66 @@ Tensor cross_entropy_loss_prob_target(
}
}
Tensor cross_entropy_loss_label_smoothing(
const Tensor& self,
const Tensor& target,
const Tensor& weight,
int64_t reduction,
int64_t ignore_index,
double label_smoothing) {
auto input = at::log_softmax(self, 1, self.scalar_type());
auto nllloss = at::nll_loss_nd(input, target, weight, reduction, ignore_index);
auto n_classes = input.size(1);
Tensor smooth_loss;
if (weight.defined()) {
// Expand weight to the correct number of dims for broadcasting with input / target
auto weight_broadcast_shape = SmallBuffer<int64_t, 5>(input.dim());
std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1);
weight_broadcast_shape[1] = weight.size(0);
Tensor weight_ = weight.view(weight_broadcast_shape);
smooth_loss = -(input * weight_).sum(1);
} else {
smooth_loss = -input.sum(1);
}
if (ignore_index >= 0) {
smooth_loss.index_put_({target == ignore_index}, 0.0);
}
Tensor ret;
switch (reduction) {
case Reduction::Mean:
if (weight.defined()) {
// TODO: This code path can be removed if #61309 is resolved
// loss is normalized by the weights to be consistent with nll_loss_nd
ret = smooth_loss.sum() / weight.gather(0, target.flatten()).sum();
} else {
ret = smooth_loss.mean();
}
break;
case Reduction::Sum:
ret = smooth_loss.sum();
break;
case Reduction::None:
ret = smooth_loss;
break;
default:
TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction);
}
return (1 - label_smoothing) * nllloss + ret * (label_smoothing / n_classes);
}
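In math form (a restatement of the code above, not extra behavior): with K = n_classes and smoothing factor \varepsilon = label_smoothing, the per-sample loss before reduction is

\ell_i = (1 - \varepsilon)\,\mathrm{nll}_i \;+\; \frac{\varepsilon}{K} \sum_{k=1}^{K} \bigl(-\log \mathrm{softmax}(x_i)_k\bigr)

with the class weights folded into both terms when given, ignore_index positions zeroed in the smoothing term, and the selected reduction applied as in the switch above.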
Tensor cross_entropy_loss(
const Tensor& self,
const Tensor& target,
const c10::optional<Tensor>& weight,
int64_t reduction,
int64_t ignore_index) {
int64_t ignore_index,
double label_smoothing) {
Tensor ret;
if (self.sizes() == target.sizes()) {
// Assume soft targets when input and target shapes are the same
@ -519,7 +583,14 @@ Tensor cross_entropy_loss(
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
const Tensor& weight_ = *weight_maybe_owned;
ret = cross_entropy_loss_prob_target(self, target, weight_, reduction);
ret = cross_entropy_loss_prob_target(self, target, weight_, reduction, label_smoothing);
} else if (label_smoothing > 0.0) {
TORCH_CHECK(label_smoothing <= 1.0, "label_smoothing must be between 0.0 and 1.0. Got: ", label_smoothing);
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight);
const Tensor& weight_ = *weight_maybe_owned;
ret = cross_entropy_loss_label_smoothing(self, target, weight_, reduction, ignore_index, label_smoothing);
} else {
ret = at::nll_loss_nd(
at::log_softmax(self, 1, self.scalar_type()),

View File

@ -1,90 +1,17 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Parallel.h>
#include <tuple>
#include <ATen/native/cpu/MaxUnpoolKernel.h>
namespace at {
namespace native {
template <typename scalar_t>
Tensor max_unpooling2d_forward_out_cpu_frame(
Tensor& output,
const Tensor& input,
const Tensor& indices,
int64_t oheight,
int64_t owidth) {
int64_t numBatch = 1;
int64_t dimc = 0;
int64_t dimh = 1;
int64_t dimw = 2;
if (input.ndimension() == 4) {
numBatch = input.size(0);
dimc++;
dimh++;
dimw++;
}
int64_t numChannels = input.size(dimc);
int64_t inputHeight = input.size(dimh);
int64_t inputWidth = input.size(dimw);
auto* rawInput = input.data_ptr<scalar_t>();
auto* rawIndices = indices.data_ptr<int64_t>();
auto* rawOutput = output.data_ptr<scalar_t>();
at::internal::lazy_init_num_threads();
for (int64_t n = 0; n < numBatch; n++) {
int64_t nOutputOffset = n * numChannels * owidth * oheight;
int64_t nInputOffset = n * numChannels * inputWidth * inputHeight;
int64_t k = 0;
bool has_error = false;
int64_t error_index = 0;
#pragma omp parallel for private(k)
for (k = 0; k < numChannels; k++) {
int64_t finalOutputOffset = nOutputOffset + k * owidth * oheight;
int64_t finalInputOffset = nInputOffset + k * inputWidth * inputHeight;
scalar_t* output_p_k = rawOutput + finalOutputOffset;
scalar_t* input_p_k = rawInput + finalInputOffset;
int64_t* ind_p_k = rawIndices + finalInputOffset;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (int64_t i = 0; i < inputHeight; i++) {
for (int64_t j = 0; j < inputWidth; j++) {
maxp = ind_p_k[i * inputWidth + j];
if (maxp < 0 || maxp >= owidth * oheight) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_p_k[maxp] = input_p_k[i * inputWidth + j];
}
}
}
}
if (has_error) {
AT_ERROR(
"Found an invalid max index: ",
error_index,
" (output volumes are of size ",
oheight,
"x",
owidth);
(void)error_index;
}
}
return output;
}
Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
Tensor& max_unpooling2d_forward_out_cpu(
const Tensor& self_,
const Tensor& indices_,
IntArrayRef output_size,
Tensor& output) {
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK(output.is_contiguous(), "output must be contiguous");
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64");
@ -100,8 +27,9 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
TORCH_CHECK(self_.numel() > 0, "Input must be non-empty");
auto self = self_.contiguous();
auto indices = indices_.contiguous();
auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
if (self.ndimension() == 3) {
int64_t numChannels = self.size(0);
@ -109,15 +37,11 @@ Tensor& max_unpooling2d_forward_out_cpu(const Tensor& self_,
} else {
int64_t numBatch = self.size(0);
int64_t numChannels = self.size(1);
output.resize_({numBatch, numChannels, oheight, owidth});
output.resize_({numBatch, numChannels, oheight, owidth}, memory_format);
}
output.zero_();
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling2d_forward_out_cpu_frame", ([&] {
max_unpooling2d_forward_out_cpu_frame<scalar_t>(
output, self, indices, oheight, owidth);
}));
max_unpool2d_kernel(kCPU, output, self, indices);
return output;
};
@ -130,87 +54,6 @@ Tensor max_unpooling2d_forward_cpu(
return output;
}
template <typename scalar_t>
Tensor max_unpooling3d_forward_out_cpu_frame(
Tensor& output,
const Tensor& input,
const Tensor& indices,
int64_t oT,
int64_t oH,
int64_t oW) {
int64_t nBatch = 1;
int64_t dimw = 3;
int64_t dimh = 2;
int64_t dimt = 1;
if (input.ndimension() == 5) {
nBatch = input.size(0);
dimw++;
dimh++;
dimt++;
}
int64_t nSlices = input.size(dimt - 1);
int64_t iT = input.size(dimt);
int64_t iH = input.size(dimh);
int64_t iW = input.size(dimw);
scalar_t* input_data = input.data_ptr<scalar_t>();
scalar_t* output_data = output.data_ptr<scalar_t>();
int64_t* indices_data = indices.data_ptr<int64_t>();
at::internal::lazy_init_num_threads();
for (int64_t p = 0; p < nBatch; p++) {
int64_t inputOffset = p * nSlices * iT * iW * iH;
int64_t outputOffset = p * nSlices * oT * oW * oH;
int64_t k = 0;
bool has_error = false;
int error_index = 0;
#pragma omp parallel for private(k)
for (k = 0; k < nSlices; k++) {
int64_t finalInputOffset = inputOffset + k * iT * iW * iH;
int64_t finalOutputOffset = outputOffset + k * oT * oW * oH;
scalar_t* output_p_k = output_data + finalOutputOffset;
scalar_t* input_p_k = input_data + finalInputOffset;
int64_t* ind_p_k = indices_data + finalInputOffset;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int maxp;
for (int64_t t = 0; t < iT; t++) {
for (int64_t i = 0; i < iH; i++) {
for (int64_t j = 0; j < iW; j++) {
int64_t index = t * iH * iW + i * iW + j;
maxp = ind_p_k[index];
if (maxp < 0 || maxp >= oT * oW * oH) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_p_k[maxp] = input_p_k[index];
}
}
}
}
if (has_error) {
AT_ERROR(
"found an invalid max index ",
error_index,
" (output volumes are of size ",
oT,
"x",
oH,
"x",
oW);
(void)error_index;
}
}
}
return output;
}
static void max_unpooling3d_shape_check(
const Tensor& input,
const Tensor& gradOutput,
@ -310,16 +153,7 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_,
}
output.zero_();
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling3d_forward_out_cpu_frame", ([&] {
max_unpooling3d_forward_out_cpu_frame<scalar_t>(
output,
self,
indices,
oT,
oH,
oW);
}));
max_unpool3d_kernel(kCPU, output, self, indices);
return output;
}
@ -335,59 +169,6 @@ Tensor max_unpooling3d_forward_cpu(
return output;
}
template <typename scalar_t>
static void max_unpooling2d_backward_out_cpu_frame(
scalar_t* gradInput_p,
scalar_t* gradOutput_p,
int64_t* ind_p,
int64_t nslices,
int64_t iheight,
int64_t iwidth,
int64_t oheight,
int64_t owidth) {
bool has_error = false;
int64_t error_index = 0;
int64_t k = 0;
at::internal::lazy_init_num_threads();
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++) {
scalar_t* gradInput_p_k = gradInput_p + k * iwidth * iheight;
scalar_t* gradOutput_p_k = gradOutput_p + k * owidth * oheight;
int64_t* ind_p_k = ind_p + k * iwidth * iheight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t i, j;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (i = 0; i < iheight; i++) {
for (j = 0; j < iwidth; j++) {
maxp = ind_p_k[i * iwidth + j]; /* retrieve position of max */
if (maxp < 0 || maxp >= owidth * oheight) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
}
gradInput_p_k[i * iwidth + j] =
gradOutput_p_k[maxp]; /* update gradient */
}
}
}
if (has_error) {
AT_ERROR(
"invalid max index ",
error_index,
", owidth= ",
owidth,
", oheight= ",
oheight);
(void)error_index;
}
}
Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
const Tensor& self,
const Tensor& indices_,
@ -396,42 +177,24 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int dimw = 2;
int dimh = 1;
int nbatch = 1;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int nslices;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iheight;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iwidth;
int64_t ndim = self.ndimension();
int64_t dimh = ndim == 3 ? 1 : 2;
int64_t dimw = ndim == 3 ? 2 : 3;
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64");
TORCH_CHECK(
self.sizes() == indices_.sizes(), "Input shape must match indices shape");
TORCH_CHECK(output_size.size() == 2, "Output size must be 2");
/* get contiguous gradOutput and indices */
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();
auto memory_format = self.suggest_memory_format();
auto grad_output = grad_output_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);
/* resize */
grad_input.resize_as_(self);
grad_input.resize_(self.sizes(), memory_format);
grad_input.zero_();
if (self.ndimension() == 4) {
nbatch = self.size(0);
dimw++;
dimh++;
}
/* sizes */
nslices = self.size(dimh - 1);
iheight = self.size(dimh);
iwidth = self.size(dimw);
if (owidth != grad_output.size(dimw) || oheight != grad_output.size(dimh)) {
AT_ERROR(
"Inconsistent gradOutput size. output height = ",
@ -443,23 +206,8 @@ Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_,
"x",
grad_output.size(dimw));
}
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling2d_backward_out_cpu_frame", ([&] {
int p;
for (p = 0; p < nbatch; p++) {
auto inputOffset = p * nslices * iheight * iwidth;
auto outputOffset = p * nslices * oheight * owidth;
max_unpooling2d_backward_out_cpu_frame<scalar_t>(
grad_input.data_ptr<scalar_t>() + inputOffset,
grad_output.data_ptr<scalar_t>() + outputOffset,
indices.data_ptr<int64_t>() + inputOffset,
nslices,
iheight,
iwidth,
oheight,
owidth);
}
}));
max_unpool2d_backward_kernel(kCPU, grad_input, grad_output, indices);
return grad_input;
}
@ -468,72 +216,14 @@ Tensor max_unpooling2d_backward_cpu(
const Tensor& self,
const Tensor& indices,
IntArrayRef output_size) {
auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
at::native::max_unpooling2d_backward_out_cpu(
auto grad_input = at::empty({0}, self.options());
max_unpooling2d_backward_out_cpu(
grad_output, self, indices, output_size, grad_input);
return grad_input;
}
template <typename scalar_t>
static void max_unpooling3d_backward_out_cpu_frame(
scalar_t* gradInput_p,
scalar_t* gradOutput_p,
int64_t* ind_p,
int64_t nslices,
int64_t iT,
int64_t iH,
int64_t iW,
int64_t oT,
int64_t oH,
int64_t oW) {
int64_t k = 0;
bool has_error = false;
int error_index = 0;
at::internal::lazy_init_num_threads();
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++) {
scalar_t* gradInput_p_k = gradInput_p + k * iT * iH * iW;
scalar_t* gradOutput_p_k = gradOutput_p + k * oT * oH * oW;
int64_t* ind_p_k = ind_p + k * iT * iH * iW;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t t, i, j, index;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t maxp;
for (t = 0; t < iT; t++) {
for (i = 0; i < iH; i++) {
for (j = 0; j < iW; j++) {
index = t * iH * iW + i * iW + j;
maxp = ind_p_k[index]; /* retrieve position of max */
if (maxp < 0 || maxp >= oT * oH * oW) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
}
gradInput_p_k[index] = gradOutput_p_k[maxp]; /* update gradient */
}
}
}
}
if (has_error) {
AT_ERROR(
"invalid max index ",
error_index,
", oT= ",
oT,
", oW= ",
oW,
",oH= ",
oH);
(void)error_index;
}
}
Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
Tensor& max_unpooling3d_backward_out_cpu(
const Tensor& grad_output_,
const Tensor& self,
const Tensor& indices_,
IntArrayRef output_size,
@ -541,26 +231,17 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
IntArrayRef padding,
Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
auto oT = output_size[0];
auto oH = output_size[1];
auto oW = output_size[2];
int dimw = 3;
int dimh = 2;
int dimt = 1;
int nbatch = 1;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int nslices;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iT;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iH;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int iW;
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int64_t ndim = self.ndimension();
int64_t dimt = ndim == 4 ? 1 : 2;
int64_t dimh = ndim == 4 ? 2 : 3;
int64_t dimw = ndim == 4 ? 3 : 4;
max_unpooling3d_shape_check(
self, grad_output_, indices_, output_size, stride, padding);
// TODO (from THNN): check gradOutput shape
/* get contiguous gradOutput */
auto grad_output = grad_output_.contiguous();
auto indices = indices_.contiguous();
@ -568,39 +249,24 @@ Tensor& max_unpooling3d_backward_out_cpu(const Tensor& grad_output_,
/* resize */
grad_input.resize_as_(self);
grad_input.zero_();
if (self.ndimension() == 5) {
nbatch = self.size(0);
dimt++;
dimw++;
dimh++;
if (oW != grad_output.size(dimw) || oH != grad_output.size(dimh) || oT != grad_output.size(dimt)) {
AT_ERROR(
"Inconsistent gradOutput size. output depth = ",
oT,
", output height = ",
oH,
", output width = ",
oW,
", gradOutput: ",
grad_output.size(dimt),
"x",
grad_output.size(dimh),
"x",
grad_output.size(dimw));
}
/* sizes */
nslices = self.size(dimt - 1);
iT = self.size(dimt);
iH = self.size(dimh);
iW = self.size(dimw);
/* backprop */
AT_DISPATCH_FLOATING_TYPES(
self.scalar_type(), "max_unpooling3d_backward_out_cpu_frame", ([&] {
int p;
for (p = 0; p < nbatch; p++) {
int inputOffset = p * nslices * iT * iH * iW;
int outputOffset = p * nslices * oT * oH * oW;
max_unpooling3d_backward_out_cpu_frame<scalar_t>(
grad_input.data_ptr<scalar_t>() + inputOffset,
grad_output.data_ptr<scalar_t>() + outputOffset,
indices.data_ptr<int64_t>() + inputOffset,
nslices,
iT,
iH,
iW,
oT,
oH,
oW);
}
}));
max_unpool3d_backward_kernel(kCPU, grad_input, grad_output, indices);
return grad_input;
}
@ -611,10 +277,16 @@ Tensor max_unpooling3d_backward_cpu(
IntArrayRef output_size,
IntArrayRef stride,
IntArrayRef padding) {
auto grad_input = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto grad_input = at::empty({0}, self.options());
at::native::max_unpooling3d_backward_out_cpu(
grad_output, self, indices, output_size, stride, padding, grad_input);
return grad_input;
}
DEFINE_DISPATCH(max_unpool2d_kernel);
DEFINE_DISPATCH(max_unpool2d_backward_kernel);
DEFINE_DISPATCH(max_unpool3d_kernel);
DEFINE_DISPATCH(max_unpool3d_backward_kernel);
} // namespace native
} // namespace at

View File

@ -1209,12 +1209,15 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
if (dim < sparse_dim) {
auto dim_indices = indices[dim];
auto cpu_dim_indices = indices[dim].to(c10::kCPU).contiguous();
int64_t* cpu_dim_indices_ptr = cpu_dim_indices.data_ptr<int64_t>();
auto cpu_index = index.to(c10::kCPU).contiguous();
int64_t* cpu_index_ptr = cpu_index.data_ptr<int64_t>();
std::vector<int64_t> zindices;
std::vector<int64_t> iindices;
int64_t new_nnz = 0;
for (const auto i : c10::irange(new_sizes[dim])) {
auto idx = index[i].item<int64_t>();
for (int64_t i = 0; i < new_sizes[dim]; i++) {
int64_t idx = cpu_index_ptr[i];
if (idx < -size || idx >= size) {
TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ",
self.sizes(), " at dimension ", dim);
@ -1222,8 +1225,8 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
if (idx < 0) {
idx += size;
}
for (const auto j : c10::irange(nnz)) {
auto jdx = dim_indices[j].item<int64_t>();
for (int64_t j = 0; j < nnz; j++) {
int64_t jdx = cpu_dim_indices_ptr[j];
if (idx == jdx) {
new_nnz++;
iindices.push_back(i);

View File

@ -251,12 +251,16 @@ static inline scalar_t area_pixel_compute_scale(
bool align_corners,
const c10::optional<double> scale) {
// see Note [area_pixel_compute_scale]
if (output_size > 1) {
return align_corners
? static_cast<scalar_t>(input_size - 1) / (output_size - 1)
: compute_scales_value<scalar_t>(scale, input_size, output_size);
} else {
return scalar_t(0);
if(align_corners){
if(output_size > 1) {
return static_cast<scalar_t>(input_size - 1) / (output_size - 1);
}
else {
return static_cast<scalar_t>(0);
}
}
else{
return compute_scales_value<scalar_t>(scale, input_size, output_size);
}
}
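As a piecewise formula (note the behavioral change: when align_corners is false, the code now always defers to compute_scales_value, which uses the explicit scales value if provided and input_size / output_size otherwise, instead of returning 0 for output_size <= 1):

\mathrm{scale} =
\begin{cases}
  (\mathrm{input\_size} - 1)/(\mathrm{output\_size} - 1) & \text{align\_corners},\ \mathrm{output\_size} > 1 \\
  0 & \text{align\_corners},\ \mathrm{output\_size} \le 1 \\
  \mathrm{compute\_scales\_value}(\mathrm{scale}, \mathrm{input\_size}, \mathrm{output\_size}) & \text{otherwise}
\end{cases}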

View File

@ -0,0 +1,385 @@
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/native/Pool.h>
#include <ATen/native/cpu/utils.h>
namespace at { namespace native {
namespace {
template <typename scalar_t, bool is_3d = false>
void cpu_max_unpool(
Tensor& output_,
const Tensor& input,
const Tensor& indices) {
auto output = output_.contiguous();
auto input_data = input.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto output_data = output.data_ptr<scalar_t>();
// NB: input tensor dimensions:
// MaxUnpool2d:
// dim = 3: CHW
// dim = 4: NCHW
// MaxUnpool3d:
// dim = 4: CDHW
// dim = 5: NCDHW
int64_t numel = input.numel();
int64_t ndim = input.ndimension();
// treat batch size and channels as one dimension
// and the feature map as another dimension
int64_t channels, output_depth, output_height, output_width;
if (is_3d) {
TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d: expect input to be 4d or 5d tensor.");
channels = ndim == 4 ? input.size(0) : input.size(0) * input.size(1);
output_depth = output.size(-3);
output_height = output.size(-2);
output_width = output.size(-1);
} else {
TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d: expect input to be 3d or 4d tensor.");
channels = ndim == 3 ? input.size(0) : input.size(0) * input.size(1);
output_depth = 1;
output_height = output.size(-2);
output_width = output.size(-1);
}
int64_t input_image_size = numel / channels;
int64_t output_image_size = output.numel() / channels;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, C, D, H, W: [channels, input_image_size]
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
int64_t c = 0;
int64_t ip = 0;
data_index_init(begin, c, channels, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* output_ptr = output_data + c * output_image_size;
int64_t maxp = indices_data[i];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_ptr[maxp] = input_data[i];
}
// move on to next input index
data_index_step(c, channels, ip, input_image_size);
}
});
if (has_error) {
if (is_3d) {
AT_ERROR("Found an invalid max index: ", error_index,
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width);
(void)error_index;
} else {
AT_ERROR("Found an invalid max index: ", error_index,
" (output volumes are of size ", output_height,
"x", output_width);
(void)error_index;
}
}
if (!output_.is_contiguous()) {
output_.copy_(output);
}
}
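The parallel loop above walks a flattened range over [channels, input_image_size] while keeping a (c, ip) pair in step with the linear index. A self-contained sketch of that bookkeeping (walk_range is a hypothetical stand-in for the data_index_init / data_index_step helpers):

#include <cstdint>

int64_t walk_range(int64_t begin, int64_t end, int64_t input_image_size) {
  int64_t c = begin / input_image_size;     // what data_index_init computes
  int64_t ip = begin % input_image_size;
  int64_t checksum = 0;
  for (int64_t i = begin; i < end; i++) {
    checksum += c * input_image_size + ip;  // always equals i
    if (++ip == input_image_size) {         // what data_index_step does
      ip = 0;
      ++c;
    }
  }
  return checksum;
}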
template <typename scalar_t>
void cpu_max_unpool_channels_last(
Tensor& output_,
const Tensor& input,
const Tensor& indices) {
TORCH_CHECK(input.ndimension() == 4,
"max_unpool2d with channels last format supports tensors with 4 dims");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto output = output_.contiguous(memory_format);
auto input_data = input.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto output_data = output.data_ptr<scalar_t>();
int64_t nbatch = input.size(0);
int64_t channels = input.size(1);
int64_t input_height = input.size(2);
int64_t input_width = input.size(3);
int64_t output_height = output.size(2);
int64_t output_width = output.size(3);
int64_t input_image_size = input_height * input_width;
int64_t output_image_size = output_height * output_width;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, H, W
at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
int64_t n = 0;
int64_t ip = 0;
data_index_init(begin, n, nbatch, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* input_ptr = input_data + i * channels;
int64_t* indices_ptr = indices_data + i * channels;
scalar_t* output_ptr = output_data + n * output_image_size * channels;
// can't do scatter on avx2 (only available on avx512)
for (int64_t c = 0; c < channels; c++) {
int64_t maxp = indices_ptr[c];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
output_ptr[maxp * channels + c] = input_ptr[c];
}
}
// move on to next input index
data_index_step(n, nbatch, ip, input_image_size);
}
});
if (has_error) {
AT_ERROR("Found an invalid max index: ", error_index,
" (output volumes are of size ", output_height,
"x", output_width);
(void)error_index;
}
if (!output_.is_contiguous(memory_format)) {
output_.copy_(output);
}
}
template <typename scalar_t, bool is_3d = false>
void cpu_max_unpool_backward(
Tensor& grad_input_,
const Tensor& grad_output,
const Tensor& indices) {
auto grad_input = grad_input_.contiguous();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
auto grad_input_data = grad_input.data_ptr<scalar_t>();
int64_t numel = grad_input.numel();
int64_t ndim = grad_output.ndimension();
// treat batch size and channels as one dimension
// and the feature map as another dimension
int64_t channels, output_depth, output_height, output_width;
if (is_3d) {
TORCH_CHECK(ndim == 4 || ndim == 5, "MaxUnpool3d_backward: expect grad_output to be 4d or 5d tensor.");
channels = ndim == 4 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
output_depth = grad_output.size(-3);
output_height = grad_output.size(-2);
output_width = grad_output.size(-1);
} else {
TORCH_CHECK(ndim == 3 || ndim == 4, "MaxUnpool2d_backward: expect grad_output to be 3d or 4d tensor.");
channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1);
output_depth = 1;
output_height = grad_output.size(-2);
output_width = grad_output.size(-1);
}
int64_t input_image_size = numel / channels;
int64_t output_image_size = grad_output.numel() / channels;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, C, D, H, W
at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) {
int64_t c = 0;
int64_t ip = 0;
data_index_init(begin, c, channels, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* grad_output_ptr = grad_output_data + c * output_image_size;
int64_t maxp = indices_data[i];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
grad_input_data[i] = grad_output_ptr[maxp];
}
// move on to next input index
data_index_step(c, channels, ip, input_image_size);
}
});
if (has_error) {
if (is_3d) {
AT_ERROR("invalid max index ", error_index,
", odepth= ", output_depth,
", owidth= ", output_width,
", oheight= ", output_height);
(void)error_index;
} else {
AT_ERROR("invalid max index ", error_index,
", owidth= ", output_width,
", oheight= ", output_height);
(void)error_index;
}
}
if (!grad_input_.is_contiguous()) {
grad_input_.copy_(grad_input);
}
}
template <typename scalar_t>
void cpu_max_unpool_backward_channels_last(
Tensor& grad_input_,
const Tensor& grad_output,
const Tensor& indices) {
TORCH_CHECK(grad_output.ndimension() == 4,
"max_unpool2d backward with channels last format supports tensors with 4 dims.");
auto memory_format = at::MemoryFormat::ChannelsLast;
auto grad_input = grad_input_.contiguous(memory_format);
auto grad_input_data = grad_input.data_ptr<scalar_t>();
auto grad_output_data = grad_output.data_ptr<scalar_t>();
auto indices_data = indices.data_ptr<int64_t>();
int64_t nbatch = grad_input.size(0);
int64_t channels = grad_input.size(1);
int64_t input_height = grad_input.size(2);
int64_t input_width = grad_input.size(3);
int64_t output_height = grad_output.size(2);
int64_t output_width = grad_output.size(3);
int64_t input_image_size = input_height * input_width;
int64_t output_image_size = output_height * output_width;
bool has_error = false;
int64_t error_index = 0;
// parallel on dim N, H, W
at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) {
int64_t n = 0;
int64_t ip = 0;
data_index_init(begin, n, nbatch, ip, input_image_size);
for (int64_t i = begin; i < end; i++) {
scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels;
scalar_t* grad_input_ptr = grad_input_data + i * channels;
int64_t* indices_ptr = indices_data + i * channels;
for (int64_t c = 0; c < channels; c++) {
int64_t maxp = indices_ptr[c];
if (maxp < 0 || maxp >= output_image_size) {
#pragma omp critical
{
has_error = true;
error_index = maxp;
}
} else {
grad_input_ptr[c] = grad_output_ptr[maxp * channels + c];
}
}
// move on to next input index
data_index_step(n, nbatch, ip, input_image_size);
}
});
if (has_error) {
AT_ERROR("invalid max index ", error_index,
", owidth= ", output_width,
", oheight= ", output_height);
(void)error_index;
}
if (!grad_input_.is_contiguous(memory_format)) {
grad_input_.copy_(grad_input);
}
}
void max_unpool2d_kernel_impl(
Tensor& output,
const Tensor& input,
const Tensor& indices) {
switch(input.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d", [&] {
cpu_max_unpool<scalar_t, /*is_3d*/false>(output, input, indices);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool2d_channels_last", [&] {
cpu_max_unpool_channels_last<scalar_t>(output, input, indices);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
void max_unpool3d_kernel_impl(
Tensor& output,
const Tensor& input,
const Tensor& indices) {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_unpool3d", [&] {
cpu_max_unpool<scalar_t, /*is_3d*/true>(output, input, indices);
});
}
void max_unpool2d_backward_kernel_impl(
Tensor& grad_input,
const Tensor& grad_output,
const Tensor& indices) {
switch(grad_output.suggest_memory_format()) {
case at::MemoryFormat::Contiguous: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward", [&] {
cpu_max_unpool_backward<scalar_t, /*is_3d*/false>(grad_input, grad_output, indices);
});
break;
}
case at::MemoryFormat::ChannelsLast: {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward_channels_last", [&] {
cpu_max_unpool_backward_channels_last<scalar_t>(grad_input, grad_output, indices);
});
break;
}
default:
TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
}
}
void max_unpool3d_backward_kernel_impl(
Tensor& grad_input,
const Tensor& grad_output,
const Tensor& indices) {
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool3d_backward", [&] {
cpu_max_unpool_backward<scalar_t, /*is_3d*/true>(grad_input, grad_output, indices);
});
}
} // anonymous namespace
REGISTER_DISPATCH(max_unpool2d_kernel, &max_unpool2d_kernel_impl);
REGISTER_DISPATCH(max_unpool2d_backward_kernel, &max_unpool2d_backward_kernel_impl);
REGISTER_DISPATCH(max_unpool3d_kernel, &max_unpool3d_kernel_impl);
REGISTER_DISPATCH(max_unpool3d_backward_kernel, &max_unpool3d_backward_kernel_impl);
}} // at::native

View File

@ -0,0 +1,16 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/DispatchStub.h>
#pragma once
namespace at { namespace native {
using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_backward_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel);
DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_backward_kernel);
}} // at::native

View File

@ -163,24 +163,29 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t
}
static void prod_kernel_impl(TensorIterator& iter) {
// Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context]
// Workaround for the error: '*' in boolean context, suggest '&&' instead
// [-Werror=int-in-bool-context]
if (iter.dtype() == ScalarType::Bool) {
using scalar_t = bool;
binary_kernel_reduce_vec(
iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a && b; },
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return a && b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
iter,
[=](scalar_t a, scalar_t b)
__ubsan_ignore_undefined__ -> scalar_t { return a && b; },
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
__ubsan_ignore_undefined__ { return a && b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
} else {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
binary_kernel_reduce_vec(
iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
[=](Vectorized <scalar_t> a, Vectorized <scalar_t> b) { return a * b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
});
iter,
[=](scalar_t a, scalar_t b)
__ubsan_ignore_undefined__ -> scalar_t { return a * b; },
[=](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
__ubsan_ignore_undefined__ { return a * b; },
// NOLINTNEXTLINE(bugprone-argument-comment)
/*identity=*/1);
});
}
}
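As the comment notes, multiplying bools trips -Werror=int-in-bool-context, so the bool specialization reduces with && and identity 1 (true), which is just an all-of. A tiny stand-alone check of that equivalence (generic C++, not the TensorIterator path):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<bool> v{true, true, false, true};
  // Product over bools with identity "true" reduces to a logical AND of all elements.
  bool prod = std::accumulate(v.begin(), v.end(), true,
                              [](bool a, bool b) { return a && b; });
  assert(prod == std::all_of(v.begin(), v.end(), [](bool b) { return b; }));
  return 0;
}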

View File

@ -648,7 +648,9 @@ Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, c
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
if (at::cuda::detail::canUse32BitIndexMath(self) && batch_norm_use_channels_last_kernels(self)){
if (at::cuda::detail::canUse32BitIndexMath(self) &&
batch_norm_use_channels_last_kernels(self) &&
batch_norm_use_channels_last_kernels(input)) {
return batch_norm_backward_elemt_channels_last_cuda_template(self, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count);
}

View File

@ -1649,7 +1649,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;
dim3 grid;
@ -1716,7 +1717,8 @@ at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
const auto reduction_size = input.numel() / stride;
auto norm_fct = 1.0 / reduction_size;
at::Tensor grad_input = at::empty_like(input, input.suggest_memory_format());
// Input is guaranteed to be channels-last compatible
at::Tensor grad_input = at::empty_like(input);
dim3 block;
dim3 grid;

View File

@ -207,6 +207,87 @@ struct offset_t {
}
namespace {
// Segmented sort by full sort algorithm:
// Say we are sorting a (2, 3) tensor. We have in flattened form:
// values 0.4 1.2 5.3 6.2 1.3 2.3
// indices 0 1 2 0 1 2
// segment_id 0 0 0 1 1 1
// First we sort by values, globally:
// values 6.2 5.3 2.3 1.3 1.2 0.4
// indices 0 2 2 1 1 0
// segment_id 1 0 1 1 0 0
// Then we stable sort by segment id:
// values 5.3 1.2 0.4 6.2 2.3 1.3
// indices 2 1 0 0 2 1
// segment_id 0 0 0 1 1 1
// This method can only work if the slice we are sorting (`dim`) is
// innermost, and both values and indices are contiguous. We do this
// by re-arranging the input into this form as needed, which will
// unfortunately allocate memory if the request is not in this form.
// Vectorized sort is slower than iterated sort if the number of
// slices is small (since we're sorting twice, instead of invoking a
// smaller sort `numSlices` times), but the cub sort
// implementation here is a catch-all, so we're not looking for
// efficiency, but instead correctness.
template<typename scalar_t>
__global__ void sort_postprocess_kernel(const scalar_t *in, scalar_t *out, int64_t *index, const int2 *i_s_ptr, int nsegments, int nsort) {
CUDA_KERNEL_LOOP(i, nsegments * nsort) {
int segment = i / nsort;
int j = i % nsort;
int offset = segment * nsort;
const scalar_t *in_ = in + offset;
scalar_t *out_ = out + offset;
int64_t *index_ = index + offset;
const int2 *i_s_ptr_ = i_s_ptr + offset;
int idx = i_s_ptr_[j].y;
index_[j] = idx;
out_[j] = in_[idx];
}
}
template<typename scalar_t>
inline void segmented_sort_pairs_by_full_sort(
int64_t nsegments, int64_t nsort, int64_t n, bool descending, const Tensor &indices,
const scalar_t *self_ptr, scalar_t *values_ptr, int64_t *indices_ptr
) {
int64_t segment_bits = std::max<int64_t>(1L, static_cast<int64_t>(std::ceil(std::log2(nsegments))));
auto int_options = indices.options().dtype(kInt);
auto indices_and_segment = at::empty({nsegments, nsort, 2}, int_options);
indices_and_segment.select(-1, 0).copy_( // segment id
at::arange(nsegments, int_options).view({nsegments, 1}).expand({nsegments, nsort}));
indices_and_segment.select(-1, 1).copy_( // reverse indices
at::arange(nsort, int_options).view({1, nsort}).expand({nsegments, nsort}));
auto i_s_ptr = reinterpret_cast<int2 *>(indices_and_segment.data_ptr<int>());
auto indices_and_segment2 = at::empty_like(indices_and_segment);
auto i_s_ptr2 = reinterpret_cast<int2 *>(indices_and_segment2.data_ptr<int>());
at::cuda::cub::sort_pairs<scalar_t, int2>(
self_ptr, nullptr, i_s_ptr, i_s_ptr2,
n, descending);
TORCH_INTERNAL_ASSERT(segment_bits <= 32);
// sort on lower 32bits, i.e. segment index
at::cuda::cub::sort_keys<int64_t>(
reinterpret_cast<int64_t *>(i_s_ptr2), reinterpret_cast<int64_t *>(i_s_ptr),
n, false, 0, segment_bits);
sort_postprocess_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
self_ptr, values_ptr, indices_ptr, i_s_ptr, nsegments, nsort);
}
} // namespace
// We perform a segmented sort in cub with inputs that have
// more than 1024/2048 elements along the selected dimension.
// Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace).
@ -349,11 +430,15 @@ std::tuple<Tensor &,Tensor &> sort_out_stable_cuda(const Tensor & self, c10::opt
int64_t n = std::min(remaining, nbatch);
int64_t nsegments = n / nsort;
auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();
at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
if (nsegments < 128) {
segmented_sort_pairs_by_full_sort(nsegments, nsort, n, descending,
indices, self_ptr, values_ptr, indices_ptr);
} else {
auto reverse_indices = at::arange(nsort, indices.options()).view({1, nsort}).expand({nsegments, nsort}).contiguous();
at::cuda::cub::segmented_sort_pairs(self_ptr, values_ptr,
reverse_indices.data_ptr<int64_t>(), indices_ptr, n, nsegments,
offset_t{(int)nsort, 0}, offset_t{(int)nsort, 1}, descending);
}
remaining -= n;
self_ptr += n;

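The two-pass trick described in the comment of this hunk — one global sort by value, then a stable sort by segment id — can be reproduced on the CPU with std::sort and std::stable_sort. The sketch below replays the (2, 3) example from the comment for a descending sort; it is illustrative only and has nothing to do with the cub kernels:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Entry { float value; int index; int segment; };

int main() {
  // Flattened (2, 3) tensor from the comment: values, per-slice indices, segment ids.
  std::vector<Entry> e = {
      {0.4f, 0, 0}, {1.2f, 1, 0}, {5.3f, 2, 0},
      {6.2f, 0, 1}, {1.3f, 1, 1}, {2.3f, 2, 1}};
  // Pass 1: global sort by value (descending).
  std::sort(e.begin(), e.end(),
            [](const Entry& a, const Entry& b) { return a.value > b.value; });
  // Pass 2: stable sort by segment id. Stability preserves the descending order
  // inside each segment, so every slice comes out sorted.
  std::stable_sort(e.begin(), e.end(),
                   [](const Entry& a, const Entry& b) { return a.segment < b.segment; });
  for (const Entry& x : e)
    std::printf("%.1f (index %d, segment %d)\n", x.value, x.index, x.segment);
  // Prints 5.3 1.2 0.4 | 6.2 2.3 1.3, matching the comment's final rows.
  return 0;
}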
View File

@ -94,11 +94,16 @@ __host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
int output_size,
bool align_corners,
const c10::optional<double> scale) {
if (output_size > 1) {
return align_corners ? (accscalar_t)(input_size - 1) / (output_size - 1)
: compute_scales_value<accscalar_t>(scale, input_size, output_size);
} else {
return static_cast<accscalar_t>(0);
if(align_corners) {
if(output_size > 1) {
return (accscalar_t)(input_size - 1) / (output_size - 1);
}
else {
return static_cast<accscalar_t>(0);
}
}
else{
return compute_scales_value<accscalar_t>(scale, input_size, output_size);
}
}
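The reshuffled helper above only special-cases output_size <= 1 when align_corners is set; otherwise it always defers to compute_scales_value. A host-side sketch of the same control flow, with compute_scales_value replaced by a hypothetical fallback that uses an explicit scale when one is supplied and the size ratio otherwise:

#include <cstdio>

// Hypothetical stand-in for compute_scales_value.
double fallback_scale(double explicit_scale, int input_size, int output_size) {
  return explicit_scale > 0.0 ? 1.0 / explicit_scale
                              : static_cast<double>(input_size) / output_size;
}

double area_pixel_scale(int input_size, int output_size,
                        bool align_corners, double explicit_scale) {
  if (align_corners) {
    // Corners map onto corners; with a single output element there is no
    // spacing to compute, so the scale collapses to zero.
    return output_size > 1
        ? static_cast<double>(input_size - 1) / (output_size - 1)
        : 0.0;
  }
  return fallback_scale(explicit_scale, input_size, output_size);
}

int main() {
  std::printf("%f\n", area_pixel_scale(8, 4, /*align_corners=*/true, -1.0));   // 2.333...
  std::printf("%f\n", area_pixel_scale(8, 4, /*align_corners=*/false, -1.0));  // 2.0
  return 0;
}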

View File

@ -393,31 +393,32 @@ kernel void clamp(texture2d_array<half, access::read> in_arr[[texture(0), functi
}
}
kernel void hardswish(texture2d_array<half, access::read> in[[texture(0)]],
texture2d_array<half, access::write> out[[texture(1)]],
constant bool hardswish_is_arr = (ushort_arg_0 > 1 || ushort_arg_1 > 4);
constant bool hardswish_is_tex = !hardswish_is_arr;
kernel void hardswish(texture2d_array<half, access::read> in_arr[[texture(0), function_constant(hardswish_is_arr)]],
texture2d<half, access::read> in_tex[[texture(0), function_constant(hardswish_is_tex)]],
texture2d_array<half, access::write> out_arr[[texture(1), function_constant(hardswish_is_arr)]],
texture2d<half, access::write> out_tex[[texture(1), function_constant(hardswish_is_tex)]],
ushort3 gid[[thread_position_in_grid]]) {
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
const ushort oH = ushort_arg_2;
const ushort oW = ushort_arg_3;
if (gid.x >= oW || gid.y >= oH) {
return;
}
ushort2 gid_ = gid.xy;
half4 value = in.read(gid_, gid.z);
half4 mask1 = half4(value < 3.0);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out.write(outval, gid_, gid.z);
}
kernel void hardswish_nonarray(texture2d<half, access::read> in[[texture(0)]],
texture2d<half, access::write> out[[texture(1)]],
ushort2 gid[[thread_position_in_grid]]) {
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
return;
if (hardswish_is_arr) {
half4 value = in_arr.read(gid_, gid.z);
half4 mask1 = half4(value < 3.0);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out_arr.write(outval, gid_, gid.z);
} else {
half4 value = in_tex.read(gid_);
half4 mask1 = half4(value < 3);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out_tex.write(outval, gid_);
}
half4 value = in.read(gid);
half4 mask1 = half4(value < 3);
half4 mask2 = half4(value > -3.0);
half4 outval = mask2*(mask1*(value*(value + 3.0)/6.0) + (1 - mask1)*value);
out.write(outval, gid);
}
constant bool out_is_arr = (ushort_arg_3 > 1 || ushort_arg_2 > 4);
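Both texture paths of the merged kernel evaluate the same branch-free hardswish: one mask keeps the x*(x+3)/6 branch for |x| < 3, a second mask zeroes everything at or below -3, and values at or above 3 pass through. A scalar C++ sketch of that mask arithmetic checked against the piecewise definition (illustrative only, not the Metal code):

#include <cassert>
#include <cmath>

// Mask form used by the shader.
float hardswish_masked(float x) {
  float mask1 = x < 3.0f ? 1.0f : 0.0f;   // inner-branch selector
  float mask2 = x > -3.0f ? 1.0f : 0.0f;  // zero everything at or below -3
  return mask2 * (mask1 * (x * (x + 3.0f) / 6.0f) + (1.0f - mask1) * x);
}

// Piecewise reference definition.
float hardswish_ref(float x) {
  if (x <= -3.0f) return 0.0f;
  if (x >= 3.0f) return x;
  return x * (x + 3.0f) / 6.0f;
}

int main() {
  for (float x = -6.0f; x <= 6.0f; x += 0.25f)
    assert(std::fabs(hardswish_masked(x) - hardswish_ref(x)) < 1e-6f);
  return 0;
}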

View File

@ -41,6 +41,7 @@ bool test_softmax();
bool test_sigmoid();
bool test_hardsigmoid();
bool test_hardswish();
bool test_hardswish2();
bool test_upsampling_nearest2d_vec();
bool test_upsampling_nearest2d_vec2();
bool test_adaptive_avg_pool2d();

View File

@ -262,6 +262,18 @@ bool test_hardswish() {
});
}
bool test_hardswish2() {
__block std::vector<int64_t> size{1, 3, 44, 44};
return TEST(size, __PRETTY_FUNCTION__, ^bool {
auto X =
at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat)) * 12 - 6;
auto X2 = X.metal();
auto Y1 = at::hardswish_(X);
auto Y2 = at::hardswish_(X2).cpu();
return almostEqual(Y1, Y2);
});
}
bool test_addmm() {
bool result = true;
for (int i = 0; i < ITER_COUNT; ++i) {

View File

@ -69,6 +69,7 @@
REG_TEST("test_sigmoid", test_sigmoid);
REG_TEST("test_hardsigmoid", test_hardsigmoid);
REG_TEST("test_hardswish", test_hardswish);
REG_TEST("test_hardswish2", test_hardswish2);
REG_TEST("test_upsampling_nearest2d_vec", test_upsampling_nearest2d_vec);
REG_TEST("test_upsampling_nearest2d_vec2", test_upsampling_nearest2d_vec2);
REG_TEST("test_adaptive_avg_pool2d", test_adaptive_avg_pool2d);

View File

@ -24,9 +24,9 @@ Tensor& hardswish_(Tensor& input) {
id<MTLComputeCommandEncoder> encoder =
[commandBuffer.buffer computeCommandEncoder];
id<MTLComputePipelineState> state = [[MetalContext sharedInstance]
specializedPipelineState:mpscnn::kernelFor(
X, "hardswish", "hardswish_nonarray")
specializedPipelineState:"hardswish"
Constants:@[
@(X.numberOfImages),
@(X.featureChannels),
@(X.height),
@(X.width)

View File

@ -6652,7 +6652,7 @@
device_check: NoCheck # TensorIterator
variants: method
- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor
- func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, float label_smoothing=0.0) -> Tensor
python_module: nn
- func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)

View File

@ -1,3 +1,5 @@
#include <ATen/native/quantized/cpu/qembeddingbag_prepack.h>
#include <c10/core/ScalarType.h>
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
@ -122,7 +124,6 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
namespace at {
namespace native {
namespace {
// Note - This is a temporary pack function for embedding bag which quantizes
// and packs the float weight tensor. In the next step it will be replaced by a
@ -184,7 +185,7 @@ namespace {
//
// [[50. , 60.00000035],
// [70. , 80.00000035]]])
Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
// The "last" dimension of an N-Dimensioned batch of embedding bags is
// quantization channel. E.g. for a 2D embedding bag, this has
// [ row, col ] dimensions, for batched of embedding bags, dimensions might be
@ -208,17 +209,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
const int32_t embedding_cols = weight_sizes[cols_dim];
// Add 8 bytes per column to store FP32 scale and zero_point per row.
const int32_t output_columns = embedding_cols + 2 * sizeof(float);
Tensor weight_contig = weight.contiguous(weight.suggest_memory_format());
const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
// Adjust output dimensions to account for FP32 scale and zero_points.
std::vector<int64_t> output_shape = weight_sizes.vec();
output_shape[cols_dim] = output_columns;
// Allocate output packed weights
auto output = at::empty(
output_shape,
weight_contig.options().dtype(at::kByte),
weight_contig.suggest_memory_format());
at::native::resize_(output, output_shape, c10::nullopt);
auto* output_data = output.data_ptr<uint8_t>();
#ifdef USE_FBGEMM
@ -246,10 +242,9 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
}
#else
const auto float_weight = weight_contig.scalar_type() == at::ScalarType::Half
? weight_contig.to(at::ScalarType::Float)
: weight_contig;
const auto weight_data = float_weight.data_ptr<float>();
const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half
? weight_contig->to(at::ScalarType::Float).data_ptr<float>()
: weight_contig->data_ptr<float>();
constexpr float kEpsilon = 1e-8f;
for (auto row: c10::irange(embedding_rows)) {
const float* input_row = weight_data + row * embedding_cols;
@ -276,6 +271,21 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
return output;
}
Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
const auto weight_contig = weight.expect_contiguous(weight.suggest_memory_format());
auto output = at::detail::empty_cpu(
{0},
at::kByte,
weight_contig->layout(),
weight_contig->device(),
c10::nullopt,
c10::nullopt);
qembeddingbag_byte_prepack_out(output, weight);
return output;
}
namespace {
// TODO: Extend support to N-D batched embeddings, similar to qembeddingbag_byte_prepack
Tensor _qembeddingbag_nbit_prepack_helper(
const Tensor& weight,

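The byte-prepack path above quantizes each embedding row to uint8 and appends the row's FP32 scale and minimum after the data, which is where output_columns = embedding_cols + 2 * sizeof(float) comes from. A minimal CPU sketch of that per-row layout, assuming the same min/max scheme as the non-FBGEMM branch (it skips half inputs and other details of the real op):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Pack a [rows, cols] float matrix row by row into uint8, appending the row's
// float scale and minimum, so each packed row is cols + 2 * sizeof(float) bytes.
std::vector<uint8_t> byte_prepack(const std::vector<float>& w, int rows, int cols) {
  const int out_cols = cols + 2 * static_cast<int>(sizeof(float));
  std::vector<uint8_t> out(static_cast<size_t>(rows) * out_cols);
  for (int r = 0; r < rows; ++r) {
    const float* row = w.data() + static_cast<size_t>(r) * cols;
    const float mn = *std::min_element(row, row + cols);
    const float mx = *std::max_element(row, row + cols);
    const float range = std::max(mx - mn, 1e-8f);  // guard against constant rows
    const float scale = range / 255.0f;
    uint8_t* out_row = out.data() + static_cast<size_t>(r) * out_cols;
    for (int c = 0; c < cols; ++c)
      out_row[c] = static_cast<uint8_t>(std::lrintf((row[c] - mn) / scale));
    std::memcpy(out_row + cols, &scale, sizeof(float));
    std::memcpy(out_row + cols + sizeof(float), &mn, sizeof(float));
  }
  return out;
}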
View File

@ -0,0 +1,11 @@
#include <ATen/ATen.h>
namespace at {
namespace native {
Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
Tensor qembeddingbag_byte_prepack(const Tensor& weight);
} // namespace native
} // namespace at

View File

@ -11,7 +11,7 @@
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <THC/THCThrustAllocator.cuh>

View File

@ -0,0 +1,74 @@
/*
Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
This file will be removed eventually.
*/
#include <ATen/Dispatch.h>
#include <ATen/SparseTensorUtils.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
namespace at {
namespace native {
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense) {
TORCH_INTERNAL_ASSERT(nnz > 0);
// No half support, so we don't have to use CUDATypeConversion
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
values.scalar_type(), "addmm_sparse_cuda", [&] {
scalar_t cast_beta = beta.to<scalar_t>();
scalar_t cast_alpha = alpha.to<scalar_t>();
Tensor r__;
if (cast_beta == scalar_t(0)) {
r_.zero_();
} else if (!at::sparse::is_same_tensor(t, r_)) {
r_.copy_(t);
}
if (r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
r__ = r_;
} else {
// Note: This storage arrangement is preferred because most of the CUDA kernels handle only contiguous tensors
r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
r__.transpose_(0, 1);
}
TORCH_INTERNAL_ASSERT(r__.transpose(-1, -2).is_contiguous());
Tensor dense_;
char transpose_dense;
if (dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
transpose_dense = 'n';
dense_ = dense;
} else if (dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
transpose_dense = 't';
dense_ = dense;
} else {
transpose_dense = 't';
dense_ = dense.contiguous();
}
sparse::cuda::csrmm2(
'n',
transpose_dense,
m,
n,
k,
nnz,
cast_alpha,
values.data_ptr<scalar_t>(),
crow_indices.data_ptr<int32_t>(),
col_indices.data_ptr<int32_t>(),
dense_.data_ptr<scalar_t>(),
(transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
cast_beta,
r__.data_ptr<scalar_t>(),
r__.stride(1));
if (!at::sparse::is_same_tensor(r__, r_)) {
r_.copy_(r__);
}
}
);
}
} // namespace native
} // namespace at

View File

@ -0,0 +1,18 @@
#pragma once
#include <ATen/Tensor.h>
#include <ATen/core/Scalar.h>
/*
Functions here use deprecated cuSPARSE API that was removed in CUDA 11.
Here only 32-bit sparse indices are supported.
This file will be removed eventually.
*/
namespace at {
namespace native {
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, const Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, const Tensor& dense);
} // namespace native
} // namespace at

View File

@ -2,7 +2,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Exception.h>
#include <ATen/cuda/Exceptions.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <cusparse.h>
@ -14,7 +14,7 @@
// Using these APIs in any other systems will result in compile-time or run-time failures.
// Their support will be extended in the next releases.
#if defined(__CUDACC__) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
#if defined(CUDART_VERSION) && (CUSPARSE_VERSION >= 11000 || (!defined(_MSC_VER) && CUSPARSE_VERSION >= 10301))
#define IS_SPMM_AVAILABLE() 1
#else
#define IS_SPMM_AVAILABLE() 0

View File

@ -5,8 +5,9 @@
#include <ATen/NativeFunctions.h>
#include <ATen/SparseTensorUtils.h>
#include <ATen/native/sparse/SparseTensorMath.h>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
#include <ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <ATen/cuda/CUDAUtils.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
@ -50,64 +51,6 @@ namespace {
}
}
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense) {
TORCH_INTERNAL_ASSERT(nnz > 0);
// No half support, so we don't have to use CUDATypeConversion
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
values.scalar_type(), "addmm_sparse_cuda", [&] {
scalar_t cast_beta = beta.to<scalar_t>();
scalar_t cast_alpha = alpha.to<scalar_t>();
Tensor r__;
if (cast_beta == scalar_t(0)) {
r_.zero_();
} else if (!is_same_tensor(t, r_)) {
r_.copy_(t);
}
if(r_.stride(0) == 1 && r_.stride(1) == r_.size(0)) {
r__ = r_;
} else {
// Note: This storage arrangement is preferred because most of the CUDA kernels handle only contiguous tensors
r__ = r_.transpose(0, 1).clone(at::MemoryFormat::Contiguous);
r__.transpose_(0, 1);
}
Tensor dense_;
char transpose_dense;
if(dense.stride(0) == 1 && dense.stride(1) == dense.size(0)) {
transpose_dense = 'n';
dense_ = dense;
} else if(dense.stride(1) == 1 && dense.stride(0) == dense.size(1)) {
transpose_dense = 't';
dense_ = dense;
} else {
transpose_dense = 't';
dense_ = dense.contiguous();
}
sparse::cuda::csrmm2(
'n',
transpose_dense,
m,
n,
k,
nnz,
cast_alpha,
values.data_ptr<scalar_t>(),
crow_indices.data_ptr<int32_t>(),
col_indices.data_ptr<int32_t>(),
dense_.data_ptr<scalar_t>(),
(transpose_dense == 'n' ? dense_.stride(1) : dense_.stride(0)),
cast_beta,
r__.data_ptr<scalar_t>(),
r__.stride(1));
if (!is_same_tensor(r__, r_)) {
r_.copy_(r__);
}
}
);
}
// NB: Deleted spaddcmul (aka addcmul_, but not actually wired up), spaddcdiv (not
// wired at all)

View File

@ -6,8 +6,6 @@
namespace at { namespace native {
void s_addmm_out_csr_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& crow_indices, Tensor& col_indices, Tensor& values, const Tensor& dense);
void s_addmm_out_sparse_dense_cuda_worker(int64_t nnz, int64_t m, int64_t n, int64_t k, Tensor& r_, const Scalar& beta, const Tensor& t, const Scalar& alpha, Tensor& indices, Tensor& values, const Tensor& dense);
}} // namespace at::native

View File

@ -19,7 +19,8 @@
#include <ATen/cuda/CUDAUtils.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseBlasLegacy.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <ATen/native/sparse/cuda/SparseCUDATensorMath.cuh>
#include <thrust/device_ptr.h>

View File

@ -18,7 +18,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAUtils.h>
#include <cusparse.h>
#include <ATen/native/sparse/cuda/SparseCUDABlas.cuh>
#include <ATen/native/sparse/cuda/SparseCUDABlas.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <thrust/device_vector.h>

View File

@ -0,0 +1,28 @@
#version 450 core
#define PRECISION $precision
layout(std430) buffer;
/* Qualifiers: layout - storage - precision - memory */
layout(set = 0, binding = 0) uniform PRECISION restrict writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
ivec4 size;
float negative_slope;
} uBlock;
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
const vec4 inval = texelFetch(uInput, pos, 0);
const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
const vec4 positive_values = vec4(1.0) - negative_values;
const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
const vec4 outval = inval * mask;
imageStore(uOutput, pos, outval);
}
}

View File

@ -0,0 +1,27 @@
#version 450 core
#define PRECISION $precision
layout(std430) buffer;
/* Qualifiers: layout - storage - precision - memory */
layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION restrict Block {
ivec4 size;
float negative_slope;
} uBlock;
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
if (all(lessThan(pos, uBlock.size.xyz))) {
const vec4 inval = imageLoad(uOutput, pos);
const vec4 negative_values = vec4(lessThan(inval, vec4(0.0f)));
const vec4 positive_values = vec4(1.0) - negative_values;
const vec4 mask = negative_values * vec4(uBlock.negative_slope) + positive_values;
const vec4 outval = inval * mask;
imageStore(uOutput, pos, outval);
}
}
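Both shaders compute leaky ReLU without branching: a 0/1 mask of the negative lanes scales them by negative_slope while positive lanes keep a factor of 1. The same arithmetic in scalar C++, checked against the piecewise definition (a sketch, not the Vulkan op):

#include <cassert>
#include <cmath>

float leaky_relu_masked(float x, float slope) {
  float negative = x < 0.0f ? 1.0f : 0.0f;   // lessThan() mask in the shader
  float positive = 1.0f - negative;
  float mask = negative * slope + positive;  // slope for negatives, 1 otherwise
  return x * mask;
}

float leaky_relu_ref(float x, float slope) {
  return x < 0.0f ? slope * x : x;
}

int main() {
  for (float x = -5.0f; x <= 5.0f; x += 0.5f)
    for (float slope : {0.01f, 1.0f, -0.001f})
      assert(std::fabs(leaky_relu_masked(x, slope) - leaky_relu_ref(x, slope)) < 1e-6f);
  return 0;
}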

View File

@ -404,6 +404,121 @@ Tensor& hardshrink_(
return self;
}
Tensor leaky_relu(
const Tensor& self_arg,
const Scalar& negative_slope) {
api::Context* const context = api::context();
const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan();
const vTensor& v_self = convert(self);
vTensor v_output{
context,
v_self.sizes(),
v_self.options(),
};
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if C10_LIKELY(v_output.has_image() && v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
float negative_slope;
} block {
v_output.extents(),
0u,
negative_slope.to<float>(),
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(leaky_relu),
v_output.extents(),
context->gpu().adapter->local_work_group_size(),
// Write-only access bypasses synchronization but inserts appropriate
// barriers if necessary.
v_output.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Write),
// Read-only access is implied on const tensors and triggers an async
// synchronization if necessary.
v_self.image(
command_buffer,
vTensor::Stage::Compute),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_pool.submit(context->gpu().queue, command_buffer);
return convert(v_output);
}
Tensor& leaky_relu_(
Tensor& self,
const Scalar& negative_slope) {
api::Context* const context = api::context();
TORCH_CHECK(
self.is_vulkan(),
"Vulkan: In-place leaky relu is only supported on Vulkan tensors.");
vTensor& v_self = convert(self);
api::Command::Pool& command_pool = context->command().pool;
api::Command::Buffer& command_buffer = command_pool.stream();
{
if C10_LIKELY(v_self.has_image()) {
const struct Block final {
uvec3 extents;
uint32_t _;
float negative_slope;
} block {
v_self.extents(),
0u,
negative_slope.to<float>(),
};
context->dispatch(
command_buffer,
{
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
},
VK_KERNEL(leaky_relu_),
v_self.extents(),
context->gpu().adapter->local_work_group_size(),
// Read-Write access triggers an async synchronization if necessary
// and inserts appropriate barriers if hazards are detected.
v_self.image(
command_buffer,
vTensor::Stage::Compute,
vTensor::Access::Read | vTensor::Access::Write),
// Object lifetime is managed by the resource pool.
// It is OK not to keep track of the handle.
context->resource().pool.uniform(block).object);
}
else {
TORCH_CHECK(false, "Not implemented!");
}
}
command_pool.submit(context->gpu().queue, command_buffer);
return self;
}
Tensor sigmoid(const Tensor& self) {
return ops::activation(self, VK_KERNEL(sigmoid));
}
@ -433,6 +548,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
m.impl(TORCH_SELECTIVE_NAME("aten::hardswish_"), hardswish_);
m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh"), hardtanh);
m.impl(TORCH_SELECTIVE_NAME("aten::hardtanh_"), hardtanh_);
m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu"), leaky_relu);
m.impl(TORCH_SELECTIVE_NAME("aten::leaky_relu_"), leaky_relu_);
m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid"), sigmoid);
m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid_"), sigmoid_);
m.impl(TORCH_SELECTIVE_NAME("aten::tanh"), tanh);

View File

@ -43,6 +43,8 @@ namespace at {
// at namespace already.
namespace {
${dispatch_helpers}
${dispatch_anonymous_definitions}
TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) {

View File

@ -979,6 +979,49 @@ TEST(VulkanAPITest, hardshrink_) {
}
}
TEST(VulkanAPITest, leaky_relu) {
if (!at::is_vulkan_available()) {
return;
}
for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
const auto in_vulkan = in_cpu.vulkan();
const auto out_cpu = at::leaky_relu(in_cpu, negative_slope);
const auto out_vulkan = at::leaky_relu(in_vulkan, negative_slope);
const auto check = almostEqual(out_cpu, out_vulkan.cpu());
if (!check) {
showRtol(out_cpu, out_vulkan.cpu());
}
ASSERT_TRUE(check);
}
}
TEST(VulkanAPITest, leaky_relu_) {
if (!at::is_vulkan_available()) {
return;
}
for (const auto negative_slope : {0.01, 0.001, 1.0, -0.001}) {
auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
auto vulkan = cpu.vulkan();
at::leaky_relu_(cpu, negative_slope);
at::leaky_relu_(vulkan, negative_slope);
const auto check = almostEqual(cpu, vulkan.cpu());
if (!check) {
showRtol(cpu, vulkan.cpu());
}
ASSERT_TRUE(check);
}
}
TEST(VulkanAPITest, hardswish) {
if (!at::is_vulkan_available()) {
return;

View File

@ -6,6 +6,7 @@ add_executable(
bench_batchnorm.cpp
bench_concat.cpp
bench_compile.cpp
bench_signed_log1p.cpp
bench_fuser_overhead.cpp
bench_gemm.cpp
bench_parallel.cpp

View File

@ -0,0 +1,120 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/tensorexpr/ir.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>
using namespace torch::jit::tensorexpr;
namespace {
class SignedLog1pBench : public benchmark::Fixture {
public:
void SetUp(const benchmark::State& state) override {
input_size_ = {state.range(0), state.range(1)};
input_size_int_ = {state.range(0), state.range(1)};
input_ = torch::rand(input_size_);
ref_ = signedLog1p(input_);
}
void TearDown(benchmark::State& state) override {
TORCH_CHECK(at::allclose(ref_, output_));
state.counters["GB/s"] = benchmark::Counter(
uint64_t(state.iterations()) * 2 * output_.nbytes(),
benchmark::Counter::kIsRate);
}
at::Tensor signedLog1p(const at::Tensor& inp) {
auto sign = at::sign(inp);
auto log1p = at::log1p(at::abs(inp));
return sign * log1p;
}
void runATen(benchmark::State& state) {
for (auto _ : state) {
output_ = signedLog1p(input_);
}
}
void runNNC(benchmark::State& state) {
Placeholder input_ph(
"input", kFloat, {input_size_int_[0], input_size_int_[1]});
Tensor abs_result = Compute(
"aten_abs",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return abs(input_ph.load(m, n));
});
Tensor log1p_result = Compute(
"aten_log1p",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return log1p(abs_result.load(m, n));
});
Tensor sign = Compute(
"aten_sign",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return CompareSelect::make(
input_ph.load(m, n),
ExprHandle(0.0f),
ExprHandle(-1),
ExprHandle(1),
kLT);
});
Tensor output = Compute(
"aten_mul",
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
[&](const VarHandle& m, const VarHandle& n) {
return sign.load(m, n) * log1p_result.load(m, n);
});
LoopNest nest({output}, {abs_result, log1p_result, sign, output});
GRAPH_DEBUG("Original Stmt: ", *nest.root_stmt());
nest.inlineIntermediateBufs(true);
nest.prepareForCodegen();
nest.simplify();
nest.vectorizeInnerLoops();
nest.simplify();
GRAPH_DEBUG("Final stmt: ", *nest.root_stmt());
// StmtPtr s = IRSimplifier::simplify(nest.root_stmt());
std::vector<CodeGen::BufferArg> buf_args;
buf_args.push_back(input_ph);
buf_args.push_back(output);
LLVMCodeGen cg(nest.root_stmt(), buf_args);
std::vector<CodeGen::CallArg> call_args;
for (auto _ : state) {
output_ = at::empty_like(ref_);
call_args.clear();
call_args.push_back(input_.data_ptr<float>());
call_args.push_back(output_.data_ptr<float>());
cg.call(call_args);
}
}
private:
std::vector<long> input_size_;
std::vector<int> input_size_int_;
at::Tensor input_;
at::Tensor output_;
at::Tensor ref_;
};
} // namespace
BENCHMARK_DEFINE_F(SignedLog1pBench, ATen)(benchmark::State& state) {
runATen(state);
}
BENCHMARK_DEFINE_F(SignedLog1pBench, NNC)(benchmark::State& state) {
runNNC(state);
}
BENCHMARK_REGISTER_F(SignedLog1pBench, ATen)->Args({10, 1467});
BENCHMARK_REGISTER_F(SignedLog1pBench, NNC)->Args({10, 1467});
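The benchmark's reference, signedLog1p, is sign(x) * log1p(|x|), and the NNC lowering rebuilds it from the same pieces (abs, log1p, a CompareSelect-based sign, multiply). A scalar C++ version of the math for reference; note that a -1/+1 CompareSelect at x == 0 still yields 0 after the multiply, since log1p(0) is 0 (a sketch, not the benchmark code):

#include <cassert>
#include <cmath>

float signed_log1p(float x) {
  const float sign = static_cast<float>((x > 0.0f) - (x < 0.0f));  // -1, 0, or +1
  return sign * std::log1p(std::fabs(x));
}

int main() {
  assert(std::fabs(signed_log1p(1.0f) - std::log1p(1.0f)) < 1e-6f);
  assert(std::fabs(signed_log1p(-2.5f) + std::log1p(2.5f)) < 1e-6f);
  assert(signed_log1p(0.0f) == 0.0f);
  return 0;
}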

View File

@ -762,3 +762,42 @@ const std::string quantize_script = R"IR(
%1249: Tensor = aten::dequantize(%1254)
return (%1249)
)IR";
const auto fmod_tensor = R"JIT(
def forward(self, a: Tensor, b: Tensor):
return torch.fmod(a, b).clone()
)JIT";
const auto fmod_scalar = R"JIT(
def forward(self, a: Tensor, b: int):
return torch.fmod(a, b).clone()
)JIT";
const std::string embedding_bag_byte_prepack_script = R"IR(
graph(%input: Tensor):
%none : None = prim::Constant()
%output: Tensor = quantized::embedding_bag_byte_prepack(%input)
%res: Tensor = aten::clone(%output, %none)
return (%res)
)IR";
const auto linalg_norm_ord_scalar = R"JIT(
def forward(self, a: Tensor, ord: int, dim: List[int], keepdim: bool, dtype: int):
return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
)JIT";
const auto linalg_norm_ord_str = R"JIT(
def forward(self, a: Tensor, ord: str, dim: List[int], keepdim: bool, dtype: int):
return torch.linalg_norm(a, ord, dim, keepdim, dtype=dtype).clone()
)JIT";
const std::string cat_script = R"IR(
graph(%a: Tensor, %b: Tensor, %dim: int):
%ten_list: Tensor[] = prim::ListConstruct(%a, %b)
%1 : int = prim::Constant[value=0]()
%2 : int = prim::Constant[value=1]()
%3 : int = prim::Constant[value=1]()
%ten_list2 : Tensor[] = aten::slice(%ten_list, %1, %2, %3)
%ret: Tensor = aten::cat(%ten_list2, %dim)
return (%ret)
)IR";

View File

@ -1,5 +1,6 @@
#include <gtest/gtest.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/runtime/static/fusion.h>
#include <torch/csrc/jit/runtime/static/impl.h>
#include <torch/csrc/jit/runtime/static/passes.h>
@ -209,6 +210,13 @@ TEST(StaticRuntime, EmbeddingBag) {
}
TEST(StaticRuntime, LayerNorm) {
#ifdef FBCODE_CAFFE2
script::Module module("module");
module.define(layer_norm_with_weights);
torch::jit::StaticModule smodule(module);
ASSERT_EQ(getNodeWithKind(smodule, "aten::layer_norm"), nullptr);
ASSERT_NE(getNodeWithKind(smodule, "static_runtime::layer_norm"), nullptr);
#endif
const auto a = torch::rand({1, 2, 2, 2});
const auto b = torch::rand({3, 2, 2, 2});
for (int normalized_size : {2, 3}) {
@ -1223,3 +1231,83 @@ TEST(StaticRuntime, IndividualOps_VarStack) {
testStaticRuntime(var_stack_script, args1, args2);
}
TEST(StaticRuntime, IndividualOps_FmodTensor) {
// fmod tensor version
auto a = at::randn({2, 3});
auto b = at::randn({2, 3});
std::vector<IValue> args0{a, b};
testStaticRuntime(fmod_tensor, args0);
// check for dynamic shapes
auto c = at::randn({4, 3, 2});
auto d = at::randn({4, 3, 2});
std::vector<IValue> args1{c, d};
testStaticRuntime(fmod_tensor, args0, args1);
}
TEST(StaticRuntime, IndividualOps_FmodScalar) {
auto a = at::randn({2, 3});
// fmod scalar version
std::vector<IValue> args2{a, 3};
testStaticRuntime(fmod_scalar, args2);
// check for dynamic shapes
auto c = at::randn({4, 3, 2});
std::vector<IValue> args3{c, 4};
testStaticRuntime(fmod_scalar, args2, args3);
}
TEST(StaticRuntime, QEmbeddingBagByteUnpack) {
auto a = torch::randn({8, 16}, at::ScalarType::Float);
auto b = torch::randn({8*2, 16*2}, at::ScalarType::Float);
testStaticRuntime(embedding_bag_byte_prepack_script, {a});
testStaticRuntime(embedding_bag_byte_prepack_script, {a},{b});
}
TEST(StaticRuntime, IndividualOps_LinalgNorm_ScalarOrd) {
auto a = at::randn({2, 3});
auto dim = std::vector<int64_t>({1});
auto dtype = at::ScalarType::Float;
std::vector<IValue> args0{a, 4, dim, true, dtype};
testStaticRuntime(linalg_norm_ord_scalar, args0);
auto b = at::randn({4, 5});
std::vector<IValue> args1{b, 4, dim, true, dtype};
testStaticRuntime(linalg_norm_ord_scalar, args0, args1);
}
TEST(StaticRuntime, IndividualOps_LinalgNorm_StringOrd) {
auto a = at::randn({2, 3});
auto dim = std::vector<int64_t>({0, 1});
auto dtype = at::ScalarType::Float;
std::vector<IValue> args0{a, "fro", dim, true, dtype};
testStaticRuntime(linalg_norm_ord_str, args0);
auto b = at::randn({4, 5});
std::vector<IValue> args1{b, "fro", dim, true, dtype};
testStaticRuntime(linalg_norm_ord_str, args0, args1);
}
TEST(StaticRuntime, IndividualOps_Cat) {
auto graph = std::make_shared<Graph>();
std::unordered_map<std::string, Value*> vmap;
parseIR(cat_script, graph.get(), vmap);
torch::jit::StaticModule smodule(graph);
ASSERT_TRUE(getNodeWithKind(smodule, "aten::cat"));
auto a = at::randn({2, 4});
auto b = at::randn({3, 4});
std::vector<IValue> args0{a, b, 0};
testStaticRuntime(cat_script, args0);
auto c = at::randn({3, 4});
auto d = at::randn({3, 5});
std::vector<IValue> args1{c, d, 1};
testStaticRuntime(cat_script, args0, args1);
}

View File

@ -4,8 +4,10 @@ namespace c10 {
namespace {
// By default, grad mode is enabled and inference mode is disabled
thread_local AutogradState autograd_state_tls =
AutogradState(/* grad_mode */ true, /* inference_mode */ false);
thread_local AutogradState autograd_state_tls = AutogradState(
/* grad_mode */ true,
/* inference_mode */ false,
/* fw_grad_mode */ true);
} // namespace
AutogradState& AutogradState::get_tls_state() {

View File

@ -12,13 +12,19 @@ struct C10_API AutogradState {
static AutogradState& get_tls_state();
static void set_tls_state(AutogradState state);
AutogradState(bool grad_mode, bool inference_mode)
: grad_mode_(grad_mode), inference_mode_(inference_mode) {}
AutogradState(bool grad_mode, bool inference_mode, bool fw_grad_mode)
: grad_mode_(grad_mode),
inference_mode_(inference_mode),
fw_grad_mode_(fw_grad_mode) {}
void set_grad_mode(bool enabled) {
grad_mode_ = enabled;
}
void set_fw_grad_mode(bool enabled) {
fw_grad_mode_ = enabled;
}
void set_inference_mode(bool enabled) {
inference_mode_ = enabled;
}
@ -27,6 +33,10 @@ struct C10_API AutogradState {
return grad_mode_;
}
bool get_fw_grad_mode() const {
return fw_grad_mode_;
}
bool get_inference_mode() const {
return inference_mode_;
}
@ -34,6 +44,7 @@ struct C10_API AutogradState {
private:
bool grad_mode_ : 1;
bool inference_mode_ : 1;
bool fw_grad_mode_ : 1;
};
} // namespace c10

View File

@ -1,4 +1,3 @@
#include <c10/core/AutogradState.h>
#include <c10/core/GradMode.h>
#include <stdexcept>

View File

@ -1,5 +1,6 @@
#pragma once
#include <c10/core/AutogradState.h>
#include <c10/macros/Macros.h>
namespace c10 {
@ -27,4 +28,17 @@ struct TORCH_API NoGradGuard : public AutoGradMode {
NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
};
// A RAII, thread local (!) guard that enables or disables forward grad mode
// upon construction, and sets it back to the original value upon destruction.
struct TORCH_API AutoFwGradMode {
AutoFwGradMode(bool enabled)
: prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) {
AutogradState::get_tls_state().set_fw_grad_mode(enabled);
}
~AutoFwGradMode() {
AutogradState::get_tls_state().set_fw_grad_mode(prev_mode);
}
bool prev_mode;
};
} // namespace c10
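AutoFwGradMode follows the same RAII shape as the existing AutoGradMode guard: capture the thread-local forward-grad flag, override it for the scope, and restore it in the destructor (including on exceptions). A hedged usage sketch; the helper function below is hypothetical and only illustrates the pattern:

#include <c10/core/GradMode.h>  // now also pulls in AutogradState

// Hypothetical helper: run fn on this thread with forward-mode AD disabled,
// restoring the previous fw_grad_mode flag when the guard goes out of scope.
template <typename Fn>
auto with_fw_grad_disabled(Fn&& fn) {
  c10::AutoFwGradMode guard(/*enabled=*/false);
  return fn();
}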

View File

@ -53,10 +53,12 @@ struct TORCH_API InferenceMode {
InferenceMode(bool enabled = true)
: prev_mode(AutogradState::get_tls_state()),
prev_keyset(c10::impl::tls_local_dispatch_key_set()) {
// Enabling inference mode means disabling grad mode
// And disabling inference mode means enabling grad mode
AutogradState::set_tls_state(
AutogradState(/* grad_mode */ !enabled, /* inference_mode */ enabled));
// Enabling inference mode means disabling grad modes
// And disabling inference mode means enabling grad modes
AutogradState::set_tls_state(AutogradState(
/* grad_mode */ !enabled,
/* inference_mode */ enabled,
/* fw_grad_mode */ !enabled));
DispatchKeySet included = enabled
? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView)
: prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView);

View File

@ -68,7 +68,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
StorageImpl() = delete;
StorageImpl(StorageImpl&& other) = default;
StorageImpl(const StorageImpl&) = delete;
~StorageImpl() = default;
~StorageImpl() override = default;
void reset() {
data_ptr_.clear();

View File

@ -6,12 +6,13 @@
#include <initializer_list>
#include <ostream>
#include <set>
#include <vector>
#include <unordered_map>
#include <vector>
#include "c10/util/Registry.h"
#include "caffe2/core/common.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/filler.h"
#include "caffe2/utils/proto_utils.h"
@ -273,8 +274,8 @@ class TORCH_API OpSchema {
OpSchema&
Arg(const char* name, const char* description, bool required = false);
#define DECLARE_STANDARD_ARG(name, str) \
static const char* Arg_##name; \
#define DECLARE_STANDARD_ARG(name, str) \
static const char* Arg_##name; \
OpSchema& Arg##name(const char* description);
DECLARE_STANDARD_ARG(IsTest, is_test)
@ -339,7 +340,9 @@ class TORCH_API OpSchema {
return inplace_enforced_(x, y);
}
TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
TORCH_API friend std::ostream& operator<<(
std::ostream& out,
const OpSchema& schema);
const std::vector<Argument>& args() const {
return args_;
@ -562,8 +565,10 @@ OpSchema::Cost PointwiseCostInference(
}
c.flops = nElemX * OpsPerPoint;
c.bytes_read = nElemRead * sizeof(X.data_type());
c.bytes_written = nElemX * sizeof(X.data_type());
auto const& X_element_size_byte =
DataTypeToTypeMeta(X.data_type()).itemsize();
c.bytes_read = nElemRead * X_element_size_byte;
c.bytes_written = nElemX * X_element_size_byte;
return c;
}
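The recurring change in this and the following cost-inference hunks replaces sizeof(X.data_type()) — which is the size of the serialized dtype enum value, not of a tensor element — with the element width obtained via DataTypeToTypeMeta(...).itemsize(). A stand-alone illustration of the difference, using a made-up dtype tag rather than the caffe2 proto types:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Made-up dtype tag, similar in spirit to TensorProto::DataType. Its sizeof is
// the size of the enum's underlying int, regardless of which dtype it names.
enum class DataType : int32_t { FLOAT, DOUBLE, INT8 };

size_t item_size(DataType t) {
  switch (t) {
    case DataType::FLOAT:  return sizeof(float);    // 4
    case DataType::DOUBLE: return sizeof(double);   // 8
    case DataType::INT8:   return sizeof(int8_t);   // 1
  }
  return 0;
}

int main() {
  // sizeof(tag) is 4 for every dtype, so byte counts computed from it are wrong
  // for anything that is not 4 bytes wide; the lookup gives the real width.
  std::printf("sizeof(tag)=%zu double=%zu int8=%zu\n",
              sizeof(DataType), item_size(DataType::DOUBLE), item_size(DataType::INT8));
  return 0;
}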

View File

@ -1,6 +1,7 @@
#include "caffe2/operators/batch_matmul_op.h"
#include "caffe2/core/operator_schema.h"
#include "caffe2/core/types.h"
namespace caffe2 {
@ -116,9 +117,13 @@ OpSchema::Cost CostInferenceForBatchMatMul(
K = in[0].dims(ndims_A - 1);
}
auto const& A_element_size_byte =
DataTypeToTypeMeta(A.data_type()).itemsize();
auto const& Y_element_size_byte =
DataTypeToTypeMeta(Y.data_type()).itemsize();
c.flops = 2 * nElemY * K;
c.bytes_read = (nElemA + nElemB) * sizeof(A.data_type());
c.bytes_written = nElemY * sizeof(Y.data_type());
c.bytes_read = (nElemA + nElemB) * A_element_size_byte;
c.bytes_written = nElemY * Y_element_size_byte;
c.params_bytes = 0;
return c;
}
@ -180,72 +185,76 @@ class GetBatchMatMulGradient : public GradientMakerBase {
auto no_trans_arg = vector<Argument>();
auto trans_a_arg = vector<Argument>{MakeArgument<int>("trans_a", 1)};
auto trans_b_arg = vector<Argument>{MakeArgument<int>("trans_b", 1)};
auto trans_both_arg = vector<Argument>{MakeArgument<int>("trans_a", 1),
MakeArgument<int>("trans_b", 1)};
auto trans_both_arg = vector<Argument>{
MakeArgument<int>("trans_a", 1), MakeArgument<int>("trans_b", 1)};
if (trans_a) {
if (trans_b) {
// A'B':
// dA = B'G', dB = G'A'
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_both_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_both_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_both_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_both_arg)};
} else {
// A'B:
// dA = BG', dB = AG
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
no_trans_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(1), GO(0)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
no_trans_arg)};
}
} else {
if (trans_b) {
// AB':
// dA = GB, dB = G'A
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
no_trans_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_a_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
no_trans_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(0)},
vector<string>{GI(1)},
trans_a_arg)};
} else {
// AB:
// dA = GB', dB = A'G
return vector<OperatorDef>{CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
trans_a_arg)};
return vector<OperatorDef>{
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{GO(0), I(1)},
vector<string>{GI(0)},
trans_b_arg),
CreateOperatorDef(
"BatchMatMul",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(1)},
trans_a_arg)};
}
}
}

View File

@ -101,9 +101,12 @@ OpSchema::Cost CostInferenceForSplit(
CAFFE_ENFORCE_GT(in.size(), 0);
struct OpSchema::Cost cost;
cost.flops = 0;
auto input_bytes_count = nElemFromDim(in[0]) * sizeof(in[0].data_type());
auto split_bytes_count =
(in.size() == 1) ? 0 : nElemFromDim(in[1]) * sizeof(in[1].data_type());
auto const& input_0_element_size_byte =
DataTypeToTypeMeta(in[0].data_type()).itemsize();
auto const& input_1_element_size_byte =
(in.size() > 1) ? DataTypeToTypeMeta(in[1].data_type()).itemsize() : 0;
auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
auto split_bytes_count = (in.size() > 1) ? nElemFromDim(in[1]) * input_1_element_size_byte : 0;
// There can be two input blobs:
// (1) actual tensor to be split
// (2) lengths of outputs along split axis
@ -329,11 +332,13 @@ OpSchema::Cost CostInferenceForConcat(
}
auto split_info_bytes_count = in.size() * sizeof(int);
auto const& input_0_element_size_byte =
DataTypeToTypeMeta(in[0].data_type()).itemsize();
struct OpSchema::Cost cost;
cost.flops = 0;
cost.bytes_read = nElemRead * sizeof(in[0].data_type());
cost.bytes_read = nElemRead * input_0_element_size_byte;
cost.bytes_written =
size * sizeof(in[0].data_type()) + split_info_bytes_count;
size * input_0_element_size_byte + split_info_bytes_count;
cost.params_bytes = 0;
return cost;
}

View File

@ -7,6 +7,7 @@
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
#include "caffe2/utils/math.h"
@ -519,14 +520,20 @@ class ConvPoolOpBase : public Operator<Context> {
uint64_t nElemW = nElemFromDim(W);
uint64_t nElemBias = inputs.size() > 2 ? nElemFromDim(inputs[2]) : 0;
auto const& X_element_size_byte =
DataTypeToTypeMeta(X.data_type()).itemsize();
auto const& Y_element_size_byte =
DataTypeToTypeMeta(Y.data_type()).itemsize();
auto const& W_element_size_byte =
DataTypeToTypeMeta(W.data_type()).itemsize();
// grouping is NOT properly handled yet
c.flops = N * Y_t * Y_h * Y_w * kernel_t * kernel_w * kernel_h *
in_channels * out_channels * 2;
c.bytes_read = (nElemX + nElemW + nElemBias) * sizeof(X.data_type());
c.bytes_written =
N * out_channels * Y_t * Y_h * Y_w * sizeof(Y.data_type());
c.bytes_read = (nElemX + nElemW + nElemBias) * X_element_size_byte;
c.bytes_written = N * out_channels * Y_t * Y_h * Y_w * Y_element_size_byte;
c.params_bytes = out_channels * in_channels * kernel_t * kernel_h *
kernel_w * sizeof(W.data_type());
kernel_w * W_element_size_byte;
return c;
}

View File

@ -1,4 +1,5 @@
#include "caffe2/operators/distance_op.h"
#include "caffe2/core/types.h"
#include "caffe2/utils/eigen_utils.h"
#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
@ -7,7 +8,7 @@
namespace caffe2 {
template<>
template <>
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
@ -257,7 +258,9 @@ OpSchema::Cost CostInferenceForDotProduct(
CAFFE_ENFORCE_EQ(out[0].dims().size(), 1);
struct OpSchema::Cost c = PointwiseCostInference<2>(def, in);
c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type());
auto const& out_0_element_size_byte =
DataTypeToTypeMeta(out[0].data_type()).itemsize();
c.bytes_written = out[0].dims(0) * out_0_element_size_byte;
c.params_bytes = 0;
return c;
}
@ -379,10 +382,12 @@ bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
}
// L2
REGISTER_CPU_OPERATOR(SquaredL2Distance,
SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SquaredL2Distance,
SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
SquaredL2DistanceGradient,
SquaredL2DistanceGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(SquaredL2Distance)
.NumInputs(2)
@ -402,7 +407,8 @@ class GetSquaredL2DistanceGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"SquaredL2DistanceGradient", "",
"SquaredL2DistanceGradient",
"",
vector<string>{I(0), I(1), GO(0)},
vector<string>{GI(0), GI(1)});
}
@ -762,9 +768,9 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
replicate = GetArgument(Def(), "replicate").i();
}
const auto dot_arg =
vector<Argument>{MakeArgument<float>("pad_value", pad_value),
MakeArgument<bool>("replicate", replicate)};
const auto dot_arg = vector<Argument>{
MakeArgument<float>("pad_value", pad_value),
MakeArgument<bool>("replicate", replicate)};
return SingleGradientDef(
"DotProductWithPaddingGradient",
@ -775,4 +781,4 @@ class GetDotProductWithPaddingGradient : public GradientMakerBase {
}
};
REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
} // namespace caffe2
} // namespace caffe2

View File

@ -1,4 +1,5 @@
#include "caffe2/operators/fc_inference.h"
#include "caffe2/core/types.h"
namespace caffe2 {
std::vector<TensorShape> FCShapeInference(
@ -51,11 +52,12 @@ OpSchema::Cost CostInferenceForFC(
? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
: size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
const auto& X = in[0];
auto const& X_element_size_byte =
DataTypeToTypeMeta(in[0].data_type()).itemsize();
c.flops = M * N * (2 * K + 1);
c.bytes_read = (K * (M + N) + N) * sizeof(X.data_type());
c.bytes_written = M * N * sizeof(X.data_type());
c.params_bytes = (K * N + N) * sizeof(X.data_type());
c.bytes_read = (K * (M + N) + N) * X_element_size_byte;
c.bytes_written = M * N * X_element_size_byte;
c.params_bytes = (K * N + N) * X_element_size_byte;
return c;
}
@ -94,7 +96,11 @@ OpSchema::Cost CostInferenceForFCGradient(
CAFFE_ENFORCE_LT(0, out.size());
const TensorShape dW = out[0];
auto const& dW_element_size_byte =
DataTypeToTypeMeta(dW.data_type()).itemsize();
const TensorShape db = out[1];
auto const& db_element_size_byte =
DataTypeToTypeMeta(db.data_type()).itemsize();
auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
@ -111,15 +117,17 @@ OpSchema::Cost CostInferenceForFCGradient(
uint64_t size_db = nElemFromDim(db);
c.flops = M * N * (2 * K + 1);
c.bytes_written = (size_dW + size_db) * sizeof(float);
c.bytes_written =
size_dW * dW_element_size_byte + size_db * db_element_size_byte;
c.params_bytes = (K * N + N) * sizeof(float);
if (out.size() == 3) {
const TensorShape dX = out[2];
uint64_t size_dX = nElemFromDim(dX);
auto const& dX_element_size_byte =
DataTypeToTypeMeta(dX.data_type()).itemsize();
c.flops += 2 * M * N * K;
c.bytes_written += size_dX * sizeof(float);
c.bytes_written += size_dX * dX_element_size_byte;
}
return c;
}

View File

@ -2,6 +2,7 @@
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
namespace caffe2 {
@ -78,12 +79,21 @@ OpSchema::Cost CostInferenceForBatchOneHot(
const auto& length = in[1];
const auto& values = in[2];
uint64_t nBytesData = nElemFromDim(data) * sizeof(data.data_type());
uint64_t nBytesLength = nElemFromDim(length) * sizeof(length.data_type());
uint64_t nBytesValues = nElemFromDim(values) * sizeof(values.data_type());
auto const& data_element_size_byte =
DataTypeToTypeMeta(data.data_type()).itemsize();
auto const& length_element_size_byte =
DataTypeToTypeMeta(length.data_type()).itemsize();
auto const& values_element_size_byte =
DataTypeToTypeMeta(values.data_type()).itemsize();
auto const& output_element_size_byte =
DataTypeToTypeMeta(output.data_type()).itemsize();
uint64_t nBytesData = nElemFromDim(data) * data_element_size_byte;
uint64_t nBytesLength = nElemFromDim(length) * length_element_size_byte;
uint64_t nBytesValues = nElemFromDim(values) * values_element_size_byte;
c.flops = 0;
c.bytes_read = nBytesData + nBytesLength + nBytesValues;
c.bytes_written = nElemFromDim(output) * sizeof(output.data_type());
c.bytes_written = nElemFromDim(output) * output_element_size_byte;
c.params_bytes = 0;
return c;
}
@ -145,15 +155,15 @@ bool BatchBucketOneHotOp<CPUContext>::RunOnDevice() {
for (int64_t j = 0; j < D; j++) {
// here we assume the boundary values for each feature are sorted
int64_t lower_bucket_idx = std::lower_bound(
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset;
int64_t upper_bucket_idx = std::upper_bound(
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset,
boundaries_offset + lens_data[j],
input_data[pos]) -
boundaries_offset;
int64_t bucket_idx = (lower_bucket_idx + upper_bucket_idx) / 2;

View File

@ -1,6 +1,7 @@
#include "caffe2/operators/utility_ops.h"
#include <cmath>
#include <iostream>
#include "caffe2/core/types.h"
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
@ -34,9 +35,11 @@ OpSchema::Cost CostInferenceForWeightedSum(
const auto& nElem = nElemFromDim(X0);
const auto& nInputs = in.size();
c.flops = (nInputs - 1) * nElem;
c.bytes_read = (nInputs / 2) * (nElem + 1) * sizeof(X0.data_type());
c.bytes_written = nElem * sizeof(X0.data_type());
c.params_bytes = (nInputs / 2) * sizeof(X0.data_type());
auto const& X0_element_size_byte =
DataTypeToTypeMeta(X0.data_type()).itemsize();
c.bytes_read = (nInputs / 2) * (nElem + 1) * X0_element_size_byte;
c.bytes_written = nElem * X0_element_size_byte;
c.params_bytes = (nInputs / 2) * X0_element_size_byte;
return c;
}
@ -48,9 +51,7 @@ REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
REGISTER_CPU_OPERATOR(
ScatterWeightedSum,
ScatterWeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(ScatterWeightedSum, ScatterWeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
REGISTER_CPU_OPERATOR(Scatter, ScatterOp<CPUContext>);

View File

@ -7,33 +7,39 @@ from caffe2.python.test_util import TestCase
class TestConcatOpCost(TestCase):
def test_columnwise_concat(self):
workspace.ResetWorkspace()
workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=np.int32))
concat_op = core.CreateOperator(
"Concat",
["input_1", "input_2"],
["output", "split_info"],
)
workspace.RunOperatorOnce(concat_op)
def _test_columnwise_concat_for_type(dtype):
workspace.ResetWorkspace()
workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype))
workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype))
concat_op = core.CreateOperator(
"Concat",
["input_1", "input_2"],
["output", "split_info"],
)
workspace.RunOperatorOnce(concat_op)
output = workspace.FetchBlob("output")
self.assertTupleEqual(output.shape, (2, 4))
np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
output = workspace.FetchBlob("output")
self.assertTupleEqual(output.shape, (2, 4))
np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]])
flops, bytes_written, bytes_read = workspace.GetOperatorCost(
concat_op, concat_op.input
)
flops, bytes_written, bytes_read = workspace.GetOperatorCost(
concat_op, concat_op.input
)
self.assertEqual(flops, 0)
self.assertEqual(
bytes_read,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
)
self.assertEqual(
bytes_written,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
)
self.assertEqual(flops, 0)
self.assertEqual(
bytes_read,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.input),
)
self.assertEqual(
bytes_written,
sum(workspace.FetchBlob(b).nbytes for b in concat_op.output),
)
[
_test_columnwise_concat_for_type(t)
for t in [np.int64, np.float, np.half, np.int8]
]
def test_split_then_concat(self):
workspace.ResetWorkspace()

View File

@ -60,7 +60,7 @@ class TestWorkspace(unittest.TestCase):
self.assertTupleEqual(
op_cost,
namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])(
1152, 256, 2084
1152, 256, 4168
),
)

View File

@ -16,7 +16,7 @@ class TORCH_API IStreamAdapter final : public ReadAdapterInterface {
size_t size() const override;
size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
const override;
~IStreamAdapter();
~IStreamAdapter() override;
private:
std::istream* istream_;

View File

@ -1,4 +1,5 @@
#include "adagrad_op.h"
#include "caffe2/core/types.h"
namespace caffe2 {
@ -23,22 +24,30 @@ static OpSchema::Cost CostInferenceForAdagrad(
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = grad_size * 10;
auto const& moment_element_size_byte =
DataTypeToTypeMeta(moment.data_type()).itemsize();
auto const& param_element_size_byte =
DataTypeToTypeMeta(param.data_type()).itemsize();
auto const& grad_element_size_byte =
DataTypeToTypeMeta(grad.data_type()).itemsize();
auto const& lr_element_size_byte =
DataTypeToTypeMeta(lr.data_type()).itemsize();
uint64_t bytes_written =
grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
grad_size * (param_element_size_byte + moment_element_size_byte);
if (output_size == 3) {
// also need to output effective learning rate in this case
// assume it's the same data type as lr
bytes_written += grad_size * sizeof(lr.data_type());
bytes_written += grad_size * lr_element_size_byte;
} else if (output_size == 4) {
// also need to output effective learning rate and updates in this case
// assume update is the same data type as param
bytes_written +=
grad_size * (sizeof(lr.data_type()) + sizeof(param.data_type()));
grad_size * (lr_element_size_byte + param_element_size_byte);
}
c.bytes_written = bytes_written;
c.bytes_read = c.bytes_written +
grad_size * (sizeof(grad.data_type()) + sizeof(lr.data_type()));
grad_size * (grad_element_size_byte + lr_element_size_byte);
return c;
}
@ -102,10 +111,18 @@ static OpSchema::Cost CostInferenceForSparseAdagrad(
// (optimistically count sqrt as one flop).
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = grad_size * 7;
auto const& param_element_size_byte =
DataTypeToTypeMeta(param.data_type()).itemsize();
auto const& moment_element_size_byte =
DataTypeToTypeMeta(moment.data_type()).itemsize();
c.bytes_written =
grad_size * (sizeof(param.data_type()) + sizeof(moment.data_type()));
c.bytes_read = c.bytes_written + grad_size * sizeof(grad.data_type()) +
n * sizeof(indices.data_type());
grad_size * (param_element_size_byte + moment_element_size_byte);
auto const& grad_element_size_byte =
DataTypeToTypeMeta(grad.data_type()).itemsize();
auto const& indices_element_size_byte =
DataTypeToTypeMeta(indices.data_type()).itemsize();
c.bytes_read = c.bytes_written + grad_size * grad_element_size_byte +
n * indices_element_size_byte;
return c;
}
@ -153,6 +170,16 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
OpSchema::Cost c;
if (n > 0) {
auto const& param_element_size_byte =
DataTypeToTypeMeta(param.data_type()).itemsize();
auto const& moment_element_size_byte =
DataTypeToTypeMeta(moment.data_type()).itemsize();
auto const& grad_element_size_byte =
DataTypeToTypeMeta(grad.data_type()).itemsize();
auto const& indices_element_size_byte =
DataTypeToTypeMeta(indices.data_type()).itemsize();
auto const& lr_element_size_byte =
DataTypeToTypeMeta(lr.data_type()).itemsize();
auto block_size = grad_size / n;
if (block_size == 1) {
// +2: applying weight decay and add to grads
@ -161,22 +188,22 @@ static OpSchema::Cost CostInferenceForRowWiseSparseAdagrad(
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = n * 9;
c.bytes_written =
n * (sizeof(param.data_type()) + sizeof(moment.data_type()));
n * (param_element_size_byte + moment_element_size_byte);
c.bytes_read = c.bytes_written +
n *
(sizeof(grad.data_type()) + sizeof(indices.data_type()) +
sizeof(lr.data_type()));
(grad_element_size_byte + indices_element_size_byte +
lr_element_size_byte);
} else {
// 5 per block (not counting index transforms)
// 8 for each value of a block
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
c.flops = n * (5 + (block_size * 8));
c.bytes_written =
n * sizeof(moment.data_type()) + n * block_size * (param.data_type());
c.bytes_written = n * moment_element_size_byte +
n * block_size * param_element_size_byte;
c.bytes_read = c.bytes_written + n * (sizeof(lr.data_type())) +
c.bytes_read = c.bytes_written + n * lr_element_size_byte +
2 * n * block_size *
(sizeof(grad.data_type()) + sizeof(param.data_type()));
(grad_element_size_byte + param_element_size_byte);
}
}
return c;

View File

@ -131,6 +131,7 @@ function(caffe2_print_configuration_summary)
endif()
message(STATUS " USE_METAL : ${USE_METAL}")
message(STATUS " USE_PYTORCH_METAL : ${USE_PYTORCH_METAL}")
message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}")
message(STATUS " USE_FFTW : ${USE_FFTW}")
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
message(STATUS " USE_MKLDNN : ${USE_MKLDNN}")

View File

@ -389,6 +389,7 @@ in :func:`torch.nn.utils.parameterize.register_parametrization`.
:toctree: generated
:nosignatures:
parametrizations.orthogonal
parametrizations.spectral_norm
Utility functions to parametrize Tensors on existing Modules.
@ -396,7 +397,7 @@ Note that these functions can be used to parametrize a given Parameter
or Buffer given a specific function that maps from an input space to the
parametrized space. They are not parameterizations that would transform
an object into a parameter. See the
`Parametrizations <https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html>`__ tutorial
`Parametrizations tutorial <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_
for more information on how to implement your own parametrizations.
.. autosummary::

View File

@ -117,7 +117,7 @@ multiple modules:
Note that :class:`~torch.nn.Sequential` automatically feeds the output of the first ``MyLinear`` module as input
into the :class:`~torch.nn.ReLU`, and the output of that as input into the second ``MyLinear`` module. As
shown, it is limited to in-order chaining of modules.
shown, it is limited to in-order chaining of modules with a single input and output.
In general, it is recommended to define a custom module for anything beyond the simplest use cases, as this gives
full flexibility on how submodules are used for a module's computation.
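As a minimal sketch of that recommendation (the ``TinyNet`` class and its layer sizes below are illustrative, not part of the tutorial's running example), a custom module can route data between its submodules in arbitrary ways that :class:`~torch.nn.Sequential` cannot express:

.. code-block:: python

    import torch
    from torch import nn

    class TinyNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.l0 = nn.Linear(4, 3)
            self.l1 = nn.Linear(3, 1)

        def forward(self, x):
            # Arbitrary control flow and reuse of intermediate values is possible
            # here, unlike with in-order chaining via Sequential.
            h = torch.relu(self.l0(x))
            return self.l1(h) + h.sum()

    net = TinyNet()
    print(net(torch.randn(2, 4)))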
@ -258,16 +258,32 @@ It's also easy to move all parameters to a different device or change their prec
dynamic_net(torch.randn(5, device='cuda', dtype=torch.float64))
: tensor([6.5166], device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
These examples show how elaborate neural networks can be formed through module composition. To allow for
quick and easy construction of neural networks with minimal boilerplate, PyTorch provides a large library of
performant modules within the :mod:`torch.nn` namespace that perform computation commonly found within neural
networks, including pooling, convolutions, loss functions, etc.
More generally, an arbitrary function can be applied to a module and its submodules recursively by
using the :func:`~torch.nn.Module.apply` function. For example, to apply custom initialization to parameters
of a module and its submodules:
.. code-block:: python
# Define a function to initialize Linear weights.
# Note that no_grad() is used here to avoid tracking this computation in the autograd graph.
@torch.no_grad()
def init_weights(m):
if isinstance(m, nn.Linear):
nn.init.xavier_normal_(m.weight)
m.bias.fill_(0.0)
# Apply the function recursively on the module and its submodules.
dynamic_net.apply(init_weights)
These examples show how elaborate neural networks can be formed through module composition and conveniently
manipulated. To allow for quick and easy construction of neural networks with minimal boilerplate, PyTorch
provides a large library of performant modules within the :mod:`torch.nn` namespace that perform common neural
network operations like pooling, convolutions, loss functions, etc.
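As a small illustration (the layer choices and shapes below are arbitrary), these provided modules compose in exactly the same way as custom ones:

.. code-block:: python

    import torch
    from torch import nn

    # Compose PyTorch-provided modules: convolution, nonlinearity, pooling, loss.
    net = nn.Sequential(
        nn.Conv2d(3, 8, kernel_size=3), nn.ReLU(), nn.AdaptiveAvgPool2d(1), nn.Flatten())
    loss_fn = nn.CrossEntropyLoss()

    images = torch.randn(4, 3, 16, 16)
    labels = torch.randint(0, 8, (4,))
    print(loss_fn(net(images), labels))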
In the next section, we give a full example of training a neural network.
For more information, check out:
* Recursively :func:`~torch.nn.Module.apply` a function to a module and its submodules
* Library of PyTorch-provided modules: `torch.nn <https://pytorch.org/docs/stable/nn.html>`_
* Defining neural net modules: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_module.html
@ -295,6 +311,12 @@ Optimizers from :mod:`torch.optim`:
loss.backward()
optimizer.step()
# After training, switch the module to eval mode to do inference, compute performance metrics, etc.
# (see discussion below for a description of training and evaluation modes)
...
net.eval()
...
In this simplified example, the network learns to simply output zero, as any non-zero output is "penalized" according
to its absolute value by employing :func:`torch.abs` as a loss function. While this is not a very interesting task, the
key parts of training are present:
@ -321,6 +343,38 @@ value of ``l1``\ 's ``weight`` parameter shows that its values are now much clos
[ 0.0030],
[-0.0008]], requires_grad=True)
Note that the above process is done entirely while the network module is in "training mode". Modules default to
training mode and can be switched between training and evaluation modes using :func:`~torch.nn.Module.train` and
:func:`~torch.nn.Module.eval`. They can behave differently depending on which mode they are in. For example, the
:class:`~torch.nn.BatchNorm` module maintains a running mean and variance during training that are not updated
when the module is in evaluation mode. In general, modules should be in training mode during training
and only switched to evaluation mode for inference or evaluation. Below is an example of a custom module
that behaves differently between the two modes:
.. code-block:: python
class ModalModule(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
if self.training:
# Add a constant only in training mode.
return x + 1.
else:
return x
m = ModalModule()
x = torch.randn(4)
print('training mode output: {}'.format(m(x)))
: tensor([1.6614, 1.2669, 1.0617, 1.6213, 0.5481])
m.eval()
print('evaluation mode output: {}'.format(m(x)))
: tensor([ 0.6614, 0.2669, 0.0617, 0.6213, -0.4519])
Training neural networks can often be tricky. For more information, check out:
* Using Optimizers: https://pytorch.org/tutorials/beginner/examples_nn/two_layer_net_optim.html.
@ -409,12 +463,127 @@ Both persistent and non-persistent buffers are affected by model-wide device / d
Buffers of a module can be iterated over using :func:`~torch.nn.Module.buffers` or
:func:`~torch.nn.Module.named_buffers`.
.. code-block:: python
for buffer in m.named_buffers():
print(buffer)
The following class demonstrates the various ways of registering parameters and buffers within a module:
.. code-block:: python
class StatefulModule(nn.Module):
def __init__(self):
super().__init__()
# Setting a nn.Parameter as an attribute of the module automatically registers the tensor
# as a parameter of the module.
self.param1 = nn.Parameter(torch.randn(2))
# Alternative string-based way to register a parameter.
self.register_parameter('param2', nn.Parameter(torch.randn(3)))
# Reserves the "param3" attribute as a parameter, preventing it from being set to anything
# except a parameter. "None" entries like this will not be present in the module's state_dict.
self.register_parameter('param3', None)
# Registers a list of parameters.
self.param_list = nn.ParameterList([nn.Parameter(torch.randn(2)) for i in range(3)])
# Registers a dictionary of parameters.
self.param_dict = nn.ParameterDict({
'foo': nn.Parameter(torch.randn(3)),
'bar': nn.Parameter(torch.randn(4))
})
# Registers a persistent buffer (one that appears in the module's state_dict).
self.register_buffer('buffer1', torch.randn(4), persistent=True)
# Registers a non-persistent buffer (one that does not appear in the module's state_dict).
self.register_buffer('buffer2', torch.randn(5), persistent=False)
# Reserves the "buffer3" attribute as a buffer, preventing it from being set to anything
# except a buffer. "None" entries like this will not be present in the module's state_dict.
self.register_buffer('buffer3', None)
# Adding a submodule registers its parameters as parameters of the module.
self.linear = nn.Linear(2, 3)
m = StatefulModule()
# Save and load state_dict.
torch.save(m.state_dict(), 'state.pt')
m_loaded = StatefulModule()
m_loaded.load_state_dict(torch.load('state.pt'))
# Note that non-persistent buffer "buffer2" and reserved attributes "param3" and "buffer3" do
# not appear in the state_dict.
print(m_loaded.state_dict())
: OrderedDict([('param1', tensor([-0.0322, 0.9066])),
('param2', tensor([-0.4472, 0.1409, 0.4852])),
('buffer1', tensor([ 0.6949, -0.1944, 1.2911, -2.1044])),
('param_list.0', tensor([ 0.4202, -0.1953])),
('param_list.1', tensor([ 1.5299, -0.8747])),
('param_list.2', tensor([-1.6289, 1.4898])),
('param_dict.bar', tensor([-0.6434, 1.5187, 0.0346, -0.4077])),
('param_dict.foo', tensor([-0.0845, -1.4324, 0.7022])),
('linear.weight', tensor([[-0.3915, -0.6176],
[ 0.6062, -0.5992],
[ 0.4452, -0.2843]])),
('linear.bias', tensor([-0.3710, -0.0795, -0.3947]))])
For more information, check out:
* Saving and loading: https://pytorch.org/tutorials/beginner/saving_loading_models.html
* Serialization semantics: https://pytorch.org/docs/master/notes/serialization.html
* What is a state dict? https://pytorch.org/tutorials/recipes/recipes/what_is_state_dict.html
Module Initialization
---------------------
By default, parameters and floating-point buffers for modules provided by :mod:`torch.nn` are initialized during
module instantiation as 32-bit floating point values on the CPU using an initialization scheme determined to
perform well historically for the module type. For certain use cases, it may be desired to initialize with a different
dtype, device (e.g. GPU), or initialization technique.
Examples:
.. code-block:: python
# Initialize module directly onto GPU.
m = nn.Linear(5, 3, device='cuda')
# Initialize module with 16-bit floating point parameters.
m = nn.Linear(5, 3, dtype=torch.half)
# Skip default parameter initialization and perform custom (e.g. orthogonal) initialization.
m = torch.nn.utils.skip_init(nn.Linear, 5, 3)
nn.init.orthogonal_(m.weight)
Note that the device and dtype options demonstrated above also apply to any floating-point buffers registered
for the module:
.. code-block:: python
m = nn.BatchNorm2d(3, dtype=torch.half)
print(m.running_mean)
: tensor([0., 0., 0.], dtype=torch.float16)
While module writers can use any device or dtype to initialize parameters in their custom modules, good practice is
to use ``dtype=torch.float`` and ``device='cpu'`` by default as well. Optionally, you can provide full flexibility
in these areas for your custom module by conforming to the convention demonstrated above that all
:mod:`torch.nn` modules follow:
* Provide a ``device`` constructor kwarg that applies to any parameters / buffers registered by the module.
* Provide a ``dtype`` constructor kwarg that applies to any parameters / floating-point buffers registered by
the module.
* Only use initialization functions (i.e. functions from :mod:`torch.nn.init`) on parameters and buffers within the
module's constructor. Note that this is only required to use :func:`~torch.nn.utils.skip_init`; see
`this page <https://pytorch.org/tutorials/prototype/skip_param_init.html#updating-modules-to-support-skipping-initialization>`_ for an explanation.
For more information, check out:
* Skipping module parameter initialization: https://pytorch.org/tutorials/prototype/skip_param_init.html
Module Hooks
------------
@ -443,16 +612,137 @@ All hooks allow the user to return an updated value that will be used throughout
Thus, these hooks can be used to either execute arbitrary code along the regular module forward/backward or
modify some inputs/outputs without having to change the module's ``forward()`` function.
Below is an example demonstrating usage of forward and backward hooks:
.. code-block:: python
torch.manual_seed(1)
def forward_pre_hook(m, inputs):
# Allows for examination and modification of the input before the forward pass.
# Note that inputs are always wrapped in a tuple.
input = inputs[0]
return input + 1.
def forward_hook(m, inputs, output):
# Allows for examination of inputs / outputs and modification of the outputs
# after the forward pass. Note that inputs are always wrapped in a tuple while outputs
# are passed as-is.
# Residual computation a la ResNet.
return output + inputs[0]
def backward_hook(m, grad_inputs, grad_outputs):
# Allows for examination of grad_inputs / grad_outputs and modification of
# grad_inputs used in the rest of the backwards pass. Note that grad_inputs and
# grad_outputs are always wrapped in tuples.
new_grad_inputs = [torch.ones_like(gi) * 42. for gi in grad_inputs]
return new_grad_inputs
# Create sample module & input.
m = nn.Linear(3, 3)
x = torch.randn(2, 3, requires_grad=True)
# ==== Demonstrate forward hooks. ====
# Run input through module before and after adding hooks.
print('output with no forward hooks: {}'.format(m(x)))
: output with no forward hooks: tensor([[-0.5059, -0.8158, 0.2390],
[-0.0043, 0.4724, -0.1714]], grad_fn=<AddmmBackward>)
# Note that the modified input results in a different output.
forward_pre_hook_handle = m.register_forward_pre_hook(forward_pre_hook)
print('output with forward pre hook: {}'.format(m(x)))
: output with forward pre hook: tensor([[-0.5752, -0.7421, 0.4942],
[-0.0736, 0.5461, 0.0838]], grad_fn=<AddmmBackward>)
# Note the modified output.
forward_hook_handle = m.register_forward_hook(forward_hook)
print('output with both forward hooks: {}'.format(m(x)))
: output with both forward hooks: tensor([[-1.0980, 0.6396, 0.4666],
[ 0.3634, 0.6538, 1.0256]], grad_fn=<AddBackward0>)
# Remove hooks; note that the output here matches the output before adding hooks.
forward_pre_hook_handle.remove()
forward_hook_handle.remove()
print('output after removing forward hooks: {}'.format(m(x)))
: output after removing forward hooks: tensor([[-0.5059, -0.8158, 0.2390],
[-0.0043, 0.4724, -0.1714]], grad_fn=<AddmmBackward>)
# ==== Demonstrate backward hooks. ====
m(x).sum().backward()
print('x.grad with no backwards hook: {}'.format(x.grad))
: x.grad with no backwards hook: tensor([[ 0.4497, -0.5046, 0.3146],
[ 0.4497, -0.5046, 0.3146]])
# Clear gradients before running backward pass again.
m.zero_grad()
x.grad.zero_()
m.register_full_backward_hook(backward_hook)
m(x).sum().backward()
print('x.grad with backwards hook: {}'.format(x.grad))
: x.grad with backwards hook: tensor([[42., 42., 42.],
[42., 42., 42.]])
Advanced Features
-----------------
PyTorch also provides several more advanced features that are designed to work with modules. All these functionalities
are "inherited" when writing a new module. In-depth discussion of these features can be found in the links below.
are available for custom-written modules, with the small caveat that certain features may require modules to conform
to particular constraints in order to be supported. In-depth discussion of these features and the corresponding
requirements can be found in the links below.
For more information, check out:
Distributed Training
********************
* Profiling: https://pytorch.org/tutorials/beginner/profiler.html
* Pruning: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html
* Quantization: https://pytorch.org/tutorials/recipes/quantization.html
* Exporting modules to TorchScript (e.g. for usage from C++):
https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html
Various methods for distributed training exist within PyTorch, both for scaling up training using multiple GPUs
as well as training across multiple machines. Check out the
`distributed training overview page <https://pytorch.org/tutorials/beginner/dist_overview.html>`_ for
detailed information on how to utilize these.
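For orientation only, here is a minimal single-process sketch of wrapping a module in ``DistributedDataParallel``; the backend, address, port, and world size are illustrative assumptions, and real jobs typically launch one process per GPU via a launcher such as ``torch.distributed.launch``:

.. code-block:: python

    import torch
    import torch.distributed as dist
    from torch import nn
    from torch.nn.parallel import DistributedDataParallel as DDP

    # A one-process "group" purely to make the example self-contained.
    dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29500",
                            rank=0, world_size=1)

    model = DDP(nn.Linear(5, 3))   # gradients are averaged across processes
    model(torch.randn(2, 5)).sum().backward()

    dist.destroy_process_group()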
Profiling Performance
*********************
The `PyTorch Profiler <https://pytorch.org/tutorials/beginner/profiler.html>`_ can be useful for identifying
performance bottlenecks within your models. It measures and outputs performance characteristics for
both memory usage and time spent.
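A minimal sketch of profiling a module's forward pass (the module and input sizes are arbitrary):

.. code-block:: python

    import torch
    from torch import nn
    from torch.profiler import profile, ProfilerActivity

    m = nn.Linear(128, 64)
    x = torch.randn(32, 128)

    # Record CPU activity for a few forward passes and print a summary table.
    with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
        for _ in range(10):
            m(x)
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))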
Improving Performance with Quantization
***************************************
Applying quantization techniques to modules can improve performance and memory usage by utilizing lower
bitwidths than floating-point precision. Check out the various PyTorch-provided mechanisms for quantization
`here <https://pytorch.org/docs/stable/quantization.html>`_.
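As one example among those mechanisms, dynamic quantization can be applied to an existing float module in a single call (the model here is an arbitrary stand-in):

.. code-block:: python

    import torch
    from torch import nn

    float_model = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 10))

    # Store Linear weights as int8 and quantize activations on the fly at inference.
    quantized_model = torch.quantization.quantize_dynamic(
        float_model, {nn.Linear}, dtype=torch.qint8)

    print(quantized_model(torch.randn(1, 64)).shape)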
Improving Memory Usage with Pruning
***********************************
Large deep learning models are often over-parametrized, resulting in high memory usage. To combat this, PyTorch
provides mechanisms for model pruning, which can help reduce memory usage while maintaining task accuracy. The
`Pruning tutorial <https://pytorch.org/tutorials/intermediate/pruning_tutorial.html>`_ describes how to utilize
the pruning techniques PyTorch provides or define custom pruning techniques as necessary.
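A minimal sketch of one such built-in technique, L1 unstructured pruning on a single layer (the layer and pruning amount are arbitrary):

.. code-block:: python

    import torch
    from torch import nn
    from torch.nn.utils import prune

    m = nn.Linear(10, 10)

    # Zero out the 30% of weight entries with the smallest L1 magnitude.
    prune.l1_unstructured(m, name="weight", amount=0.3)
    print(float((m.weight == 0).float().mean()))  # roughly 0.3

    # Make the pruning permanent by removing the reparametrization and mask.
    prune.remove(m, "weight")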
Deploying with TorchScript
**************************
When deploying a model for use in production, the overhead of Python can be unacceptable due to its poor
performance characteristics. For cases like this,
`TorchScript <https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html>`_ provides a way to load
and run an optimized model program from outside of Python, such as within a C++ program.
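A minimal sketch of the Python side of that workflow (the file name is arbitrary); the saved archive can then be loaded from C++ with ``torch::jit::load``:

.. code-block:: python

    import torch
    from torch import nn

    m = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 1))

    # Compile the module to TorchScript and serialize it.
    scripted = torch.jit.script(m)
    scripted.save("model_scripted.pt")

    # The archive can also be loaded back in Python for a quick check.
    loaded = torch.jit.load("model_scripted.pt")
    print(loaded(torch.randn(2, 4)))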
Parametrizations
****************
For certain applications, it can be beneficial to constrain the parameter space during model training. For example,
enforcing orthogonality of the learned parameters can improve convergence for RNNs. PyTorch provides a mechanism for
applying `parametrizations <https://pytorch.org/tutorials/intermediate/parametrizations.html>`_ such as this, and
further allows for custom constraints to be defined.
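A minimal sketch of a custom constraint (a symmetric weight matrix, a standard illustrative example rather than anything specific to this page):

.. code-block:: python

    import torch
    from torch import nn
    from torch.nn.utils import parametrize

    class Symmetric(nn.Module):
        def forward(self, X):
            # Constrain a square weight matrix to be symmetric.
            return X.triu() + X.triu(1).transpose(-1, -2)

    m = nn.Linear(3, 3)
    parametrize.register_parametrization(m, "weight", Symmetric())
    print(torch.allclose(m.weight, m.weight.T))  # True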
Transforming Modules with FX
****************************
The `FX <https://pytorch.org/docs/stable/fx.html>`_ component of PyTorch provides a flexible way to transform
modules by operating directly on module computation graphs. This can be used to programmatically generate or
manipulate modules for a broad array of use cases. To explore FX, check out these examples of using FX for
`convolution + batch norm fusion <https://pytorch.org/tutorials/intermediate/fx_conv_bn_fuser.html>`_ and
`CPU performance analysis <https://pytorch.org/tutorials/intermediate/fx_profiling_tutorial.html>`_.
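As a quick taste (the traced module here is arbitrary), symbolic tracing turns a module into a ``GraphModule`` whose graph and generated code can be inspected and rewritten:

.. code-block:: python

    import torch
    from torch import nn

    m = nn.Sequential(nn.Linear(4, 4), nn.ReLU())

    # Trace the module into a GraphModule backed by an editable graph.
    gm = torch.fx.symbolic_trace(m)
    print(gm.graph)   # the captured sequence of operations
    print(gm.code)    # Python generated from that graph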

View File

@ -6,10 +6,6 @@ torch.special
The torch.special module, modeled after SciPy's `special <https://docs.scipy.org/doc/scipy/reference/special.html>`_ module.
This module is in BETA. New functions are still being added, and some
functions may change in future PyTorch releases. See the documentation of each
function for details.
.. automodule:: torch.special
:noindex:
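For example (the particular functions chosen here are just an illustration), the functions mirror their SciPy counterparts:

.. code-block:: python

    import torch

    x = torch.tensor([-1.0, 0.0, 1.0])
    print(torch.special.erf(x))    # Gauss error function
    print(torch.special.expit(x))  # logistic sigmoid, like scipy.special.expit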

View File

@ -9,3 +9,4 @@ torch.testing
.. automodule:: torch.testing
.. autofunction:: assert_close
.. autofunction:: make_tensor

View File

@ -1,6 +1,8 @@
#include <gtest/gtest.h>
#include <ATen/core/boxing/impl/test_helpers.h>
#include <torch/torch.h>
#include <ATen/core/op_registration/op_registration.h>
#include <torch/csrc/autograd/functions/basic_ops.h>
@ -869,6 +871,261 @@ TEST(CustomAutogradTest, BackwardWithCreateGraphWarns) {
}
}
/**
* Tests for AutogradNotImplementedFallback
* - Check that the NotImplemented node is created when inputs require grad,
*   and that no such node is created when no inputs require grad
* - check_inplace logic
* - view ops (TODO: not an official view yet, update this once InplaceOrView kernel is landed)
* - TODO: Tests for NDEBUG checks?
* - tensorlist input and output
* - multiple outputs / non-tensor output
* - rebase_history vs set_history
*/
namespace {
torch::Tensor inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
return self.add_(other);
}
std::tuple<torch::Tensor, torch::Tensor> two_arg_inplace_op(const torch::Tensor& self, const torch::Tensor& other) {
other.add_(self);
self.add_(other);
return std::tuple<torch::Tensor, torch::Tensor>(self, other);
}
std::tuple<torch::Tensor, torch::Tensor> two_pairs_of_view_op(const torch::Tensor& self, const torch::Tensor& other) {
// This is not allowed. We test below that calling this through the boxed kernel raises an error
auto self_view = self.view(-1);
auto other_view = other.view(-1);
return std::tuple<torch::Tensor, torch::Tensor>(self_view, other_view);
}
int64_t ret_single_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
return 12;
}
torch::Tensor opt_op(const torch::Tensor& self, const c10::optional<at::Tensor>& other) {
if (other.has_value()) {
return self + other.value();
} else {
return self.clone();
}
}
torch::Tensor my_custom_op(const torch::Tensor& self, const torch::Tensor& other) {
return self + other;
}
std::tuple<torch::Tensor, torch::Tensor, int64_t> ret_tuple_non_tensor(const torch::Tensor& self, const torch::Tensor& other) {
auto a = self - other;
auto b = self + other;
return std::tuple<torch::Tensor, torch::Tensor, int64_t>(a, b, 12);
}
torch::Tensor view_op(const torch::Tensor& self, const torch::Tensor& other) {
return self.view(-1);
}
std::vector<at::Tensor> ret_tensor_vector(const torch::Tensor& self, const torch::Tensor& other) {
std::vector<at::Tensor> out;
out.push_back(self + other);
out.push_back(self - other);
return out;
}
torch::Tensor tensorlist_op(const torch::Tensor& self, at::TensorList other) {
const auto& res = self.clone();
for (const auto& t : other) {
res.add_(t);
}
return res;
}
#define REGISTER_TEST_OP(name, schema, fn) \
auto m = MAKE_TORCH_LIBRARY(_test); \
m.def(schema); \
auto m_autograd = MAKE_TORCH_LIBRARY_IMPL(_test, Autograd); \
auto m_cpu = MAKE_TORCH_LIBRARY_IMPL(_test, CPU); \
m_cpu.impl(name, c10::DispatchKey::CPU, TORCH_FN(fn)); \
m_autograd.impl(name, c10::DispatchKey::Autograd, autogradNotImplementedFallback());
template <typename F>
void assertBasicChecks(F op) {
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
auto c = torch::tensor({1.}, {torch::kFloat32});
// If any inputs require grad, the fallback attaches a NotImplemented node, so backward() should throw.
auto out1 = op(a, b);
ASSERT_THROWS_WITH(out1.backward(), "is not implemented");
// Should not have a grad_fn if no inputs require grad
auto out2 = op(b, c);
ASSERT_THROWS_WITH(out2.backward(), "element 0 of tensors does not require grad and does not have a grad_fn");
// TODO: Forward AD Tests?
}
} // namespace
TEST(TestAutogradNotImplementedFallback, RetSingleNonTensor) {
REGISTER_TEST_OP("ret_single_non_tensor", "_test::ret_single_non_tensor(Tensor self, Tensor other) -> int", ret_single_non_tensor);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_single_non_tensor", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<int64_t, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
ASSERT_EQ(op(a, b), ret_single_non_tensor(a, b));
}
TEST(TestAutogradNotImplementedFallback, DoubleViewOP) {
REGISTER_TEST_OP("two_pairs_of_view_op", "_test::two_pairs_of_view_op(Tensor(a) self, Tensor(b) other) -> (Tensor(a), Tensor(b))", two_pairs_of_view_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_pairs_of_view_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
ASSERT_THROWS_WITH(op(a, b),
"Expected only a single output in the operator schema to have a non-write alias annotation");
}
TEST(TestAutogradNotImplementedFallback, InplaceOp) {
REGISTER_TEST_OP("inplace_op", "_test::inplace_op(Tensor(a!) self, Tensor other) -> Tensor(a!)", inplace_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::inplace_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
// Check in-place
ASSERT_THROWS_WITH(op(a, b),
"a leaf Variable that requires grad is being used in an in-place operation");
op(b, a);
a = a.clone();
b = b.clone();
auto c = op(a, b);
ASSERT_TRUE(torch::allclose(c, inplace_op(a, b)));
// Test in-place on view
auto base = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true).clone();
auto view = base.view(-1);
auto t = torch::tensor({1.}, {torch::kFloat32});
torch::Tensor v_nograd;
{
c10::NoGradGuard guard;
v_nograd = base.view(-1);
op(v_nograd, t);
}
ASSERT_THROWS_WITH(op(v_nograd, t), "A view was created in no_grad mode");
ASSERT_EQ(op(view, t).unsafeGetTensorImpl(), view.unsafeGetTensorImpl());
// TODO: once we have InplaceOrView kernel, re-enable this since the version counter would actually
// be incremented
// ASSERT_THAT(op(view, t).grad_fn()->name(), ::testing::HasSubstr("AsStridedBackward"));
}
TEST(TestAutogradNotImplementedFallback, DoubleInplaceOp) {
REGISTER_TEST_OP("two_arg_inplace_op", "_test::two_arg_inplace_op(Tensor(a!) self, Tensor(b!) other) -> (Tensor(a!), Tensor(b!))", two_arg_inplace_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::two_arg_inplace_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
// Both are modified in-place!
ASSERT_THROWS_WITH(op(a, b),
"a leaf Variable that requires grad is being used in an in-place operation");
ASSERT_THROWS_WITH(op(b, a),
"a leaf Variable that requires grad is being used in an in-place operation");
}
TEST(TestAutogradNotImplementedFallback, OptOp) {
REGISTER_TEST_OP("opt_op", "_test::opt_op(Tensor self, Tensor? other) -> Tensor", opt_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::opt_op", "");
auto op = [&](const torch::Tensor& _1, const c10::optional<torch::Tensor>& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const c10::optional<torch::Tensor>&>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
auto b = torch::tensor({1.}, {torch::kFloat32});
ASSERT_TRUE(torch::allclose(op(a, b), opt_op(a, b)));
ASSERT_TRUE(torch::allclose(op(a, {}), opt_op(a, {})));
}
TEST(TestAutogradNotImplementedFallback, OutOfPlaceAddition) {
REGISTER_TEST_OP("my_custom_op", "_test::my_custom_op(Tensor self, Tensor other) -> Tensor", my_custom_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::my_custom_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, RetTupleNonTensor) {
REGISTER_TEST_OP("ret_tuple_non_tensor", "_test::ret_tuple_non_tensor(Tensor self, Tensor other) -> (Tensor, Tensor, int)", ret_tuple_non_tensor);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tuple_non_tensor", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
torch::Tensor out0;
torch::Tensor out1;
int64_t out2;
auto out = callOpUnboxed<std::tuple<torch::Tensor, torch::Tensor, int64_t>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
std::tie(out0, out1, out2) = std::move(out);
return out0;
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, ViewOp) {
REGISTER_TEST_OP("view_op", "_test::view_op(Tensor(a) self, Tensor other) -> Tensor(a)", view_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::view_op", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2);
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, RetTensorVector) {
REGISTER_TEST_OP("ret_tensor_vector", "_test::ret_tensor_vector(Tensor self, Tensor other) -> Tensor[]", ret_tensor_vector);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::ret_tensor_vector", "");
auto op = [&](const torch::Tensor& _1, const torch::Tensor& _2) {
return callOpUnboxed<std::vector<at::Tensor>, const torch::Tensor&, const torch::Tensor&>(opHandle, _1, _2)[0];
};
assertBasicChecks(op);
}
TEST(TestAutogradNotImplementedFallback, TensorlistOp) {
REGISTER_TEST_OP("tensorlist_op", "_test::tensorlist_op(Tensor self, Tensor[] other) -> Tensor", tensorlist_op);
auto opHandle = c10::Dispatcher::singleton().findSchemaOrThrow("_test::tensorlist_op", "");
auto op = [&](torch::Tensor _1, at::TensorList _2) {
return callOpUnboxed<torch::Tensor, const torch::Tensor&, at::TensorList>(opHandle, _1, _2);
};
auto a = torch::tensor({1.}, {torch::kFloat32});
auto b = torch::tensor({1.}, {torch::kFloat32});
auto c = torch::tensor({1.}, {torch::kFloat32}).set_requires_grad(true);
std::vector<torch::Tensor> vec = {b, c};
auto out = op(a, vec);
ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[0]}), "One of the differentiated Tensors does not require grad");
ASSERT_THROWS_WITH(torch::autograd::grad({out}, {vec[1]}), "is not implemented");
ASSERT_TRUE(at::allclose(op(a, vec), tensorlist_op(a, vec)));
}
// TODO add these tests if needed
// test_once_differentiable
// test_sparse_backward

View File

@ -792,6 +792,20 @@ TEST_F(FunctionalTest, CrossEntropy) {
ASSERT_TRUE(output.allclose(expected, 1e-04));
ASSERT_TRUE(F::cross_entropy(input, target).allclose(expected, 1e-04));
// label smoothing with class indices
input = torch::tensor({{3., 1.}, {1., 2.}}, torch::kFloat);
output = F::cross_entropy(
input, target, F::CrossEntropyFuncOptions().label_smoothing(0.15).reduction(torch::kMean));
expected = torch::tensor(0.3326, torch::kFloat);
ASSERT_TRUE(output.allclose(expected, 1e-04));
// label smoothing with target probabilities
target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
output = F::cross_entropy(
input, target, F::CrossEntropyFuncOptions().label_smoothing(0.2).reduction(torch::kMean));
expected = torch::tensor(0.5701, torch::kFloat);
ASSERT_TRUE(output.allclose(expected, 1e-04));
}
TEST_F(FunctionalTest, MaxUnpool1d) {

View File

@ -28,6 +28,9 @@ TEST(IMethodTest, CallMethod) {
auto pyModel = package.load_pickle("model", "model.pkl");
torch::deploy::PythonMethodWrapper pyMethod(pyModel, "forward");
EXPECT_EQ(scriptMethod.name(), "forward");
EXPECT_EQ(pyMethod.name(), "forward");
auto input = torch::ones({10, 20});
auto outputPy = pyMethod({input});
auto outputScript = scriptMethod({input});

View File

@ -2315,6 +2315,31 @@ TEST_F(ModulesTest, CrossEntropyLoss) {
ASSERT_TRUE(
CrossEntropyLoss(CrossEntropyLossOptions().ignore_index(-100).reduction(torch::kMean))
->forward(input, target).allclose(expected, 1e-04));
// label smoothing with class indices
loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.15).reduction(torch::kMean));
input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
target = torch::tensor({0, 1}, torch::kLong);
output = loss->forward(input, target);
expected = torch::tensor(0.3326, torch::kFloat);
s = output.sum();
s.backward();
ASSERT_TRUE(output.allclose(expected, 1e-04));
ASSERT_EQ(input.sizes(), input.grad().sizes());
// label smoothing with target probabilities
loss = CrossEntropyLoss(CrossEntropyLossOptions().label_smoothing(0.2).reduction(torch::kMean));
input = torch::tensor({{3., 1.}, {1., 2.}}, torch::dtype(torch::kFloat).requires_grad(true));
target = torch::tensor({{0.8, 0.2}, {0.1, 0.9}}, torch::kFloat);
output = loss->forward(input, target);
expected = torch::tensor(0.5701, torch::kFloat);
s = output.sum();
s.backward();
ASSERT_TRUE(output.allclose(expected, 1e-04));
ASSERT_EQ(input.sizes(), input.grad().sizes());
}
TEST_F(ModulesTest, CosineSimilarity) {

View File

@ -1,11 +1,11 @@
#include <gtest/gtest.h>
#include <torch/csrc/autograd/generated/variable_factories.h>
#include <torch/csrc/jit/frontend/ir_emitter.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/ir/irparser.h>
#include "torch/csrc/jit/frontend/ir_emitter.h"
#include "torch/csrc/jit/ir/alias_analysis.h"
#include "torch/csrc/jit/runtime/custom_operator.h"
#include "torch/csrc/utils/memory.h"
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/utils/memory.h>
namespace torch {
namespace jit {
@ -484,7 +484,7 @@ TEST(AliasAnalysisTest, SafeToChangeAliasingRelationship) {
TEST(WriteTrackingTest, Basic) {
RegisterOperators reg({Operator(
"prim::creates_alias(Tensor(a) x) -> Tensor(a)",
[](Stack* s) {},
[](Stack&) {},
aliasAnalysisFromSchema())});
const auto creates_alias = Symbol::fromQualString("prim::creates_alias");
auto graph = std::make_shared<Graph>();
@ -949,11 +949,11 @@ TEST(WildcardsTest, Basic) {
RegisterOperators reg(
{Operator(
"prim::returns_wildcard(Tensor a) -> Tensor(*)",
[](Stack* stack) {},
[](Stack&) {},
aliasAnalysisFromSchema()),
Operator(
"prim::writes(Tensor(z!) a) -> Tensor(a)",
[](Stack* stack) {},
[](Stack&) {},
aliasAnalysisFromSchema())});
const auto returns_wildcard =
Symbol::fromQualString("prim::returns_wildcard");

View File

@ -31,7 +31,7 @@ TEST(CustomOperatorTest, InferredSchema) {
Stack stack;
push(stack, 2.0f, at::ones(5));
op->getOperation()(&stack);
op->getOperation()(stack);
at::Tensor output;
pop(stack, output);
@ -61,7 +61,7 @@ TEST(CustomOperatorTest, ExplicitSchema) {
Stack stack;
push(stack, 2.0f, at::ones(5));
op->getOperation()(&stack);
op->getOperation()(stack);
at::Tensor output;
pop(stack, output);
@ -109,7 +109,7 @@ TEST(CustomOperatorTest, ListParameters) {
c10::List<c10::complex<double>>(
{c10::complex<double>(2.4, -5.5), c10::complex<double>(-1.3, 2)}));
push(stack, c10::List<at::Tensor>({at::ones(5)}));
op->getOperation()(&stack);
op->getOperation()(stack);
c10::List<double> output;
pop(stack, output);
@ -140,7 +140,7 @@ TEST(CustomOperatorTest, ListParameters2) {
Stack stack;
push(stack, c10::List<at::Tensor>({at::ones(5)}));
op->getOperation()(&stack);
op->getOperation()(stack);
c10::List<at::Tensor> output;
pop(stack, output);
@ -204,7 +204,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
torch::jit::RegisterOperators reg({OperatorGenerator(
TORCH_SELECTIVE_NAME_IN_SCHEMA(
op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"),
[](Stack* stack) {
[](Stack& stack) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
at::Tensor b;
@ -223,7 +223,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
torch::jit::RegisterOperators reg({OperatorGenerator(
TORCH_SELECTIVE_NAME_IN_SCHEMA(
op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"),
[](Stack* stack) {
[](Stack& stack) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
at::Tensor b;
@ -249,7 +249,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
Stack stack;
push(stack, 2.0f, at::ones(5));
op->getOperation()(&stack);
op->getOperation()(stack);
at::Tensor output;
pop(stack, output);

View File

@ -175,6 +175,15 @@ TEST(InterpreterTest, IgnorableArgsInSchema) {
ASSERT_TRUE(op_to_specified_args_non_const["aten::conv2d"] == 6);
}
TEST(InterpreterTest, IgnorableArgsInSchemaWithOut) {
auto graph = build_mobile_export_with_out();
MobileCode function(graph, "");
auto op_to_specified_args = function.op_to_num_specified_args();
ASSERT_TRUE(op_to_specified_args.size() == 1);
// this should be 3 when the add_out flag is set to True
ASSERT_TRUE(op_to_specified_args["aten::add.out"] == 4);
}
TEST(InterpreterTest, runAsyncBasicTest) {
/*
TODO: there are some problem with C++ parsing script program involving

View File

@ -520,6 +520,28 @@ TEST(SchemaParserTest, NestedArrays) {
.getElementType()));
}
TEST(SchemaParserTest, OutVariant) {
auto schema_with_out = parseSchema(
"at::foo(Tensor self, *, Tensor(a!) f, Tensor(b!) l) -> (Tensor(a!) f, Tensor(b!) l)");
ASSERT_TRUE(schema_with_out.arguments().at(1).is_out());
ASSERT_TRUE(schema_with_out.arguments().at(2).is_out());
auto schema_without_out =
parseSchema("at::foo(Tensor self, *, int scalar) -> (int)");
for (const auto& arg : schema_without_out.arguments()) {
ASSERT_TRUE(!arg.is_out());
}
auto schema_with_is_write = parseSchema(
"aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> (Tensor(a!))");
for (const auto& arg : schema_with_is_write.arguments()) {
ASSERT_TRUE(!arg.is_out());
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(SchemaParserTest, NamedReturns) {
// named returns
parseSchema("at::what(Tensor! i_will_be_written_to) -> ()");
@ -1471,11 +1493,11 @@ TEST(NoneSchemaMatchTest, Basic) {
RegisterOperators reg({
Operator(
"prim::test_none() -> int?",
[](Stack* stack) { push(stack, IValue()); },
[](Stack& stack) { push(stack, IValue()); },
aliasAnalysisFromSchema()),
Operator(
"prim::is_none(int? a) -> bool",
[](Stack* stack) {
[](Stack& stack) {
IValue a = pop(stack);
if (a.isNone()) {
push(stack, true);

View File

@ -15,7 +15,7 @@ TEST(SchemaMatchingTest, VarType) {
RegisterOperators reg({
Operator(
"aten::test_vartype(t[] a, t b) -> (t)",
[](Stack* stack) {
[](Stack& stack) {
c10::List<double> list;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
@ -54,7 +54,7 @@ TEST(SchemaMatchingTest, VarType2) {
RegisterOperators reg({
Operator(
"aten::test_vartype2(t a, t[] b) -> (t[])",
[](Stack* stack) {
[](Stack& stack) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
double a;
c10::List<double> list;

View File

@ -123,6 +123,21 @@ std::shared_ptr<Graph> build_mobile_export_analysis_graph() {
return g;
}
std::shared_ptr<Graph> build_mobile_export_with_out() {
const auto graph_string = R"IR(
graph(%x.1 : Tensor,
%y.1 : Tensor):
%8 : NoneType = prim::Constant()
%6 : int = prim::Constant[value=1]()
%7 : Tensor = aten::add(%x.1, %y.1, %6, %y.1)
return (%8))IR";
auto g = std::make_shared<Graph>();
torch::jit::parseIR(graph_string, g.get());
g->lint();
return g;
}
std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested() {
// this is pretty much the same test as build_mobile_export_analysis_graph(),
// but some aten::slice operators are hidden under block statement to check
@ -258,7 +273,7 @@ RegisterOperators reg({
// because it always produces empty Tensors.
Operator(
"prim::MakeTestTensor() -> Tensor",
[](Stack* stack) { push(stack, at::Tensor()); },
[](Stack& stack) { push(stack, at::Tensor()); },
aliasAnalysisFromSchema()),
});
} // namespace

View File

@ -74,6 +74,7 @@ std::pair<tensor_list, tensor_list> runGradient(
std::shared_ptr<Graph> build_lstm();
std::shared_ptr<Graph> build_mobile_export_analysis_graph();
std::shared_ptr<Graph> build_mobile_export_with_out();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_with_vararg();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_nested();
std::shared_ptr<Graph> build_mobile_export_analysis_graph_non_const();

View File

@ -198,6 +198,22 @@ TEST_F(Kernel, _3) {
}
}
TEST_F(Kernel, Huge) {
const auto graph_string = R"IR(
graph(%x.1 : Float(4000000000, strides=[1], requires_grad=0, device=cpu)):
%1 : int = prim::Constant[value=0]()
%2 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::unsqueeze(%x.1, %1)
%3 : Float(1, 4000000000, strides=[4000000000, 1], requires_grad=0, device=cpu) = aten::relu(%2)
return (%3))IR";
auto graph = std::make_shared<Graph>();
parseIR(graph_string, &*graph);
TensorExprKernel k(graph);
std::ostringstream oss;
oss << *k.getCodeGenStmt();
const std::string& verification_pattern = "# CHECK: 4000000000";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
}
TEST_F(Kernel, ParallelStrided) {
const auto graph_string = R"IR(
graph(%0 : Float(5, 3, 40005, strides=[120015, 40005, 1], device=cpu),
@ -786,9 +802,9 @@ TEST_F(Kernel, SumOneAxis) {
// Check the IR we produced
const std::string& verification_pattern =
R"IR(
# CHECK: for (int v = 0; v <
# CHECK: for (int64_t v = 0ll; v <
# CHECK-NEXT: sum
# CHECK-NEXT: for (int v_1 = 0; v_1 <
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 <
# CHECK-NEXT: sum)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@ -847,10 +863,10 @@ TEST_F(Kernel, SumMultipleAxes) {
// Check the IR we produced
const std::string& verification_pattern =
R"IR(
# CHECK: int v = 0
# CHECK: int v_1 = 0
# CHECK: int v_2 = 0
# CHECK: int v_3 = 0
# CHECK: int64_t v = 0
# CHECK: int64_t v_1 = 0
# CHECK: int64_t v_2 = 0
# CHECK: int64_t v_3 = 0
# CHECK: sum)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@ -1115,8 +1131,8 @@ TEST_F(Kernel, InlineProducerIntoReduction) {
// We should have only one loop in the end.
const std::string& verification_pattern =
R"IR(
# CHECK: for (int v = 0; v < 5;
# CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
# CHECK: for (int64_t v = 0ll; v < 5
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
# CHECK-NEXT: sum
# CHECK-NOT: for)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());
@ -1154,11 +1170,11 @@ TEST_F(Kernel, InlineReductionIntoConsumer) {
// We should have two loops in the end.
const std::string& verification_pattern =
R"IR(
# CHECK: for (int v = 0; v < 5;
# CHECK-NEXT: for (int v_1 = 0; v_1 < 3;
# CHECK: for (int64_t v = 0ll; v < 5
# CHECK-NEXT: for (int64_t v_1 = 0ll; v_1 < 3
# CHECK-NEXT: sum
# CHECK: for (int v_2 = 0; v_2 < 5;
# CHECK-NEXT: for (int v_3 = 0; v_3 < 3;
# CHECK: for (int64_t v_2 = 0ll; v_2 < 5
# CHECK-NEXT: for (int64_t v_3 = 0ll; v_3 < 3
# CHECK-NEXT: aten_mul
# CHECK-NOT: for)IR";
torch::jit::testing::FileCheck().run(verification_pattern, oss.str());

View File

@ -1501,42 +1501,54 @@ TEST(LLVM, RFactorVectorizedReduction) {
ExpectAllNear(b_v, b_ref, 1e-5);
}
TEST(LLVM, SimpleParallel) {
for (int test_cfg = 0; test_cfg < 4; test_cfg++) {
// Compute a simple operation, and try all loop-axis combination to be
// parallel or sequential.
const int M = 4;
const int N = 6;
Tensor f = Compute(
"f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
return cast<float>(m + n);
});
LoopNest loop_nest({f});
auto const& loops = loop_nest.getLoopStmtsFor(f);
ForPtr m = loops[0];
ForPtr n = loops[1];
if (test_cfg & 0x1) {
m->set_parallel();
}
if (test_cfg & 0x2) {
n->set_parallel();
}
loop_nest.prepareForCodegen();
StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {f});
PaddedBuffer<float> f_v(M, N, "f_v");
std::vector<void*> args({f_v.data()});
int value = cg.value<int>(args);
ASSERT_EQ(value, 0);
PaddedBuffer<float> f_ref(M, N, "f_ref");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
f_ref(m, n) = m + n;
}
}
ExpectAllNear(f_v, f_ref, 1e-5);
template <bool outer, bool inner>
static void testSimpleParallel() {
// Compute a simple operation, and try all loop-axis combinations to be
// parallel or sequential.
const int M = 4;
const int N = 6;
Tensor f = Compute(
"f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) {
return cast<float>(m + n);
});
LoopNest loop_nest({f});
auto const& loops = loop_nest.getLoopStmtsFor(f);
ForPtr m = loops[0];
ForPtr n = loops[1];
if (outer) {
m->set_parallel();
}
if (inner) {
n->set_parallel();
}
loop_nest.prepareForCodegen();
StmtPtr stmt = loop_nest.root_stmt();
LLVMCodeGen cg(stmt, {f});
PaddedBuffer<float> f_v(M, N, "f_v");
std::vector<void*> args({f_v.data()});
int value = cg.value<int>(args);
ASSERT_EQ(value, 0);
PaddedBuffer<float> f_ref(M, N, "f_ref");
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
f_ref(m, n) = m + n;
}
}
ExpectAllNear(f_v, f_ref, 1e-5);
}
TEST(LLVM, SimpleParallelSS) {
testSimpleParallel<false, false>();
}
TEST(LLVM, SimpleParallelSP) {
testSimpleParallel<false, true>();
}
TEST(LLVM, SimpleParallelPS) {
testSimpleParallel<true, false>();
}
TEST(LLVM, SimpleParallelPP) {
testSimpleParallel<true, true>();
}
TEST(LLVM, CompositeParallel) {

View File

@ -4734,8 +4734,8 @@ TEST(LoopNest, VectorizeUse) {
}
const char* int64Loop = R"IR(
# CHECK: for (int64_t n = 0; n < 12; n++) {
# CHECK: b[n] = (a[n]) + 1;
# CHECK: for (int64_t n = 0ll; n < 12ll; n++) {
# CHECK: b[n] = (a[n]) + 1ll;
# CHECK: }
)IR";
@ -4744,7 +4744,8 @@ TEST(LoopNest, Int64Direct) {
Placeholder a("a", kLong, {N});
Placeholder b("b", kLong, {N});
VarHandle n("n", kLong);
StmtPtr s = For::make(n, 0, N, b.store({n}, a.load({n}) + LongImm::make(1l)));
StmtPtr s = For::make(
n, LongImm::make(0l), N, b.store({n}, a.load({n}) + LongImm::make(1l)));
s = IRSimplifier::simplify(s);
std::ostringstream oss;
oss << *s;

Some files were not shown because too many files have changed in this diff.