[PyTorch] Add codegen unboxing ability (#69881)

Summary:
RFC: https://github.com/pytorch/rfcs/pull/40

This PR (re)introduces Python codegen for unboxing wrappers. Given an entry of `native_functions.yaml`, the codegen generates the corresponding C++ code to convert IValues from the stack into their proper types. To trigger the codegen, run
```
tools/jit/gen_unboxing.py -d cg/torch/share/ATen
```
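
For illustration, here is roughly the shape of a generated unboxing function, using `aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor` as the example. This is a simplified, hand-written sketch of what ends up in `UnboxingFunctions_[0-4].cpp`, not the verbatim codegen output; the exact conversion code and whether the call goes through the `Tensor` method or the `at::` namespace depend on the schema.

```cpp
// Simplified sketch of a codegen'd unboxing wrapper (illustrative only).
#include <ATen/ATen.h>
#include <ATen/core/stack.h>

namespace at {
namespace unboxing {

using Stack = std::vector<c10::IValue>;
using torch::jit::peek;
using torch::jit::drop;
using torch::jit::pack;

// aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
TORCH_API void add_Tensor(Stack& stack) {
  // Peek the boxed arguments off the stack...
  c10::IValue self = std::move(peek(stack, 0, 3));
  c10::IValue other = std::move(peek(stack, 1, 3));
  c10::IValue alpha = std::move(peek(stack, 2, 3));

  // ...convert each IValue into its proper C++ type...
  at::Tensor self_base = self.to<at::Tensor>();
  at::Tensor other_base = other.to<at::Tensor>();
  at::Scalar alpha_base = alpha.to<at::Scalar>();

  // ...then drop the inputs, call the unboxed kernel, and push the result back.
  drop(stack, 3);
  auto result_ = self_base.add(other_base, alpha_base);
  pack(stack, std::move(result_));
}

} // namespace unboxing
} // namespace at
```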

The CI test changes have been merged. In https://github.com/pytorch/pytorch/issues/71782 I added an e2e test for static dispatch + codegen unboxing. The test exports a MobileNetV2 mobile model, then loads and runs it in a new binary for the lite interpreter: `test/mobile/custom_build/lite_predictor.cpp`.

## Lite predictor build specifics

1. Codegen: `gen.py` generates `RegisterCPU.cpp` and `RegisterSchema.cpp`. With this PR, once `static_dispatch` mode is enabled, `gen.py` no longer generates `TORCH_LIBRARY` API calls in those cpp files, thus avoiding interaction with the dispatcher. When `USE_LIGHTWEIGHT_DISPATCH` is turned on, `cmake/Codegen.cmake` calls `gen_unboxing.py`, which generates `UnboxingFunctions.h`, `UnboxingFunctions_[0-4].cpp` and `RegisterCodegenUnboxedKernels_[0-4].cpp` (see the sketch after this list for how the registration shards tie into `UnboxingFunctions.h`).
2. Build: `USE_LIGHTWEIGHT_DISPATCH` adds the generated sources to `all_cpu_cpp` in `aten/src/ATen/CMakeLists.txt`. All other files remain unchanged. Strictly speaking the `Operators_[0-4].cpp` files are not necessary, but we can rely on the linker to strip them out.
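
As a rough sketch of how these pieces fit together: each generated `RegisterCodegenUnboxedKernels_[0-4].cpp` shard registers its operators into the JIT op registry (not the c10 dispatcher), with a boxed lambda that forwards the stack to the matching function from `UnboxingFunctions.h`. A hand-written, single-entry sketch (illustrative only; the real shards are fully generated):

```cpp
// Simplified sketch of a RegisterCodegenUnboxedKernels_*.cpp shard (illustrative only).
#include <torch/csrc/jit/runtime/operator.h>
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/jit/runtime/register_ops_utils.h>
#include <ATen/UnboxingFunctions.h>

namespace torch {
namespace jit {
namespace {

RegisterOperators reg({
    // One entry per selected operator; the boxed lambda just hands the stack
    // to the codegen'd unboxing function, which calls the statically
    // dispatched kernel.
    OperatorGenerator(
        TORCH_SELECTIVE_SCHEMA(
            "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"),
        [](Stack& stack) {
          RECORD_FUNCTION("add", std::vector<c10::IValue>());
          at::unboxing::add_Tensor(stack);
        },
        aliasAnalysisFromSchema()),
});

} // namespace
} // namespace jit
} // namespace torch
```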

## Current CI job test coverage update

Created a new CI job `linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build` that enables the following build options:
* `USE_LIGHTWEIGHT_DISPATCH=1`
* `BUILD_LITE_INTERPRETER=1`
* `STATIC_DISPATCH_BACKEND=CPU`

This job triggers `test/mobile/lightweight_dispatch/build.sh` and builds `libtorch`. The script then runs the C++ tests written in `test_lightweight_dispatch.cpp` and `test_codegen_unboxing.cpp`. Recent commits added tests to cover as many C++ argument types as possible: in `build.sh` we install the PyTorch Python API so that we can export test models in `tests_setup.py`, and then run the C++ test binary to execute these models on the lightweight-dispatch-enabled runtime.
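
To give a sense of the pattern, below is a condensed sketch based on the `Ones` case from `test_codegen_unboxing.cpp` and `tests_setup.py` (the real tests cover many more argument types): `tests_setup.py` scripts and exports `ones.ptl`, and the C++ test loads it with the lite interpreter and runs `forward`, exercising the codegen'd unboxing + static dispatch path.

```cpp
// Condensed sketch of the codegen-unboxing test pattern (illustrative only).
#include <gtest/gtest.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>

TEST(LightweightDispatchSketch, Ones) {
  // ones.ptl is exported by tests_setup.py; its forward(x) runs
  // torch.ones([3, x], dtype=torch.int64, layout=torch.strided, device="cpu").
  torch::jit::mobile::Module module = torch::jit::_load_for_mobile("ones.ptl");
  const auto result = module.forward({c10::IValue(4)});
  ASSERT_EQ(result.toTensor().size(0), 3);
  ASSERT_EQ(result.toTensor().size(1), 4);
}
```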

Pull Request resolved: https://github.com/pytorch/pytorch/pull/69881

Reviewed By: iseeyuan

Differential Revision: D33692299

Pulled By: larryliu0820

fbshipit-source-id: 211e59f2364100703359b4a3d2ab48ca5155a023
(cherry picked from commit 58e1c9a25e3d1b5b656282cf3ac2f548d98d530b)
Authored by Mengwei Liu on 2022-03-01 14:54:42 -08:00; committed by PyTorch MergeBot
parent 6396547f9e
commit 9ce9803abe
28 changed files with 1351 additions and 31 deletions

View File

@ -26,6 +26,7 @@
"linux-xenial-py3.7-clang7-asan",
"linux-xenial-py3.7-clang7-onnx",
"linux-xenial-py3.7-gcc5.4",
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build",
"linux-xenial-py3.7-gcc7",
"linux-xenial-py3.7-gcc7-no-ops",
"macos-10-15-py3-arm64",
@ -96,6 +97,7 @@
"linux-xenial-py3.7-clang7-asan",
"linux-xenial-py3.7-clang7-onnx",
"linux-xenial-py3.7-gcc5.4",
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build",
"linux-xenial-py3.7-gcc7",
"linux-xenial-py3.7-gcc7-no-ops",
"parallelnative-linux-xenial-py3.7-gcc5.4",
@ -134,6 +136,7 @@
"linux-xenial-py3.7-clang7-asan",
"linux-xenial-py3.7-clang7-onnx",
"linux-xenial-py3.7-gcc5.4",
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build",
"linux-xenial-py3.7-gcc7",
"linux-xenial-py3.7-gcc7-no-ops",
"macos-arm64-binary-conda",
@ -164,6 +167,7 @@
"ciflow/libtorch": [
"libtorch-linux-xenial-cuda10.2-py3.7-gcc7",
"libtorch-linux-xenial-cuda11.3-py3.7-gcc7",
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build",
"periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7"
],
"ciflow/linux": [
@ -184,6 +188,7 @@
"linux-xenial-py3.7-clang7-asan",
"linux-xenial-py3.7-clang7-onnx",
"linux-xenial-py3.7-gcc5.4",
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build",
"linux-xenial-py3.7-gcc7",
"linux-xenial-py3.7-gcc7-no-ops",
"parallelnative-linux-xenial-py3.7-gcc5.4",
@ -209,7 +214,8 @@
],
"ciflow/mobile": [
"linux-xenial-py3-clang5-mobile-build",
"linux-xenial-py3-clang5-mobile-custom-build-static"
"linux-xenial-py3-clang5-mobile-custom-build-static",
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
],
"ciflow/noarch": [
"linux-bionic-py3.7-clang9"
@ -262,6 +268,7 @@
"linux-xenial-py3.7-clang7-asan",
"linux-xenial-py3.7-clang7-onnx",
"linux-xenial-py3.7-gcc5.4",
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build",
"linux-xenial-py3.7-gcc7",
"linux-xenial-py3.7-gcc7-no-ops",
"macos-10-15-py3-arm64",

View File

@ -527,6 +527,17 @@ LINUX_WORKFLOWS = [
labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_MOBILE, LABEL_CIFLOW_DEFAULT},
),
),
CIWorkflow(
arch="linux",
build_environment="linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4",
test_runner_type=LINUX_CPU_TEST_RUNNER,
build_generates_artifacts=False,
exclude_test=True,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_MOBILE, LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_CPU},
),
),
CIWorkflow(
arch="linux",
build_environment="linux-xenial-py3.7-clang7-asan",

View File

@ -0,0 +1,243 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build
on:
pull_request:
push:
tags:
- 'ciflow/all/*'
- 'ciflow/cpu/*'
- 'ciflow/libtorch/*'
- 'ciflow/linux/*'
- 'ciflow/mobile/*'
- 'ciflow/trunk/*'
branches:
- master
- main
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
IS_GHA: 1
# This is used for the phase of adding wheel tests only, will be removed once completed
IN_WHEEL_TEST: 1
# Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
PYTORCH_RETRY_TEST_CASES: 1
concurrency:
group: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
build:
runs-on: linux.2xlarge
timeout-minutes: 240
env:
JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build-build
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: print labels
run: echo "${PR_LABELS}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
- name: Log in to ECR
env:
AWS_RETRY_MODE: standard
AWS_MAX_ATTEMPTS: 5
run: |
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
--password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
- name: Chown workspace
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"
# Ensure the working directory gets chowned back to the current user
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE}"
mkdir "${GITHUB_WORKSPACE}"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
# deep clone, to allow use of git merge-base
fetch-depth: 0
submodules: recursive
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "::set-output name=docker_tag::${DOCKER_TAG}"
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
- name: Check if image should be built
id: check
env:
BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
run: |
set -x
# Check if image already exists, if it does then skip building it
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
exit 0
fi
if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
# if we're on the base branch then use the parent commit
MERGE_BASE=$(git rev-parse HEAD~)
else
# otherwise we're on a PR, so use the most recent base commit
MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
fi
# Covers the case where a previous tag doesn't exist for the tree
# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
exit 1
fi
PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
# If no image exists but the hash is the same as the previous hash then we should error out here
if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
echo " contact the PyTorch team to restore the original images"
exit 1
fi
echo ::set-output name=rebuild::yes
- name: Build and push docker image
if: ${{ steps.check.outputs.rebuild }}
env:
DOCKER_SKIP_S3_UPLOAD: 1
working-directory: .circleci/docker
run: |
export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
./build_docker.sh
- name: Pull Docker image
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${DOCKER_IMAGE}"
- name: Parse ref
shell: bash
id: parse-ref
run: ./.github/scripts/parse_ref.py
- name: Build
env:
BRANCH: ${{ steps.parse-ref.outputs.branch }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e JOB_BASE_NAME \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e IS_GHA \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
-e GITHUB_RUN_ID \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e PR_LABELS \
-e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
- name: Display and upload binary build size statistics (Click Me)
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
TAG: ${{ steps.parse-ref.outputs.tag }}
WORKFLOW_ID: '${{ github.run_id }}'
run: |
COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
export COMMIT_TIME
pip3 install requests==2.26 boto3==1.16.34
python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Chown workspace
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
if: always()
run: |
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
# Prune all of the docker images
docker system prune -af
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af

View File

@ -26,6 +26,8 @@ retry pip install --pre torch torchvision \
# binary, and running forward pass with a real model.
if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then
TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh
elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then
test/mobile/lightweight_dispatch/build.sh
else
TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh
fi

View File

@ -570,6 +570,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
elif [[ "${BUILD_ENVIRONMENT}" == *distributed* || "${JOB_BASE_NAME}" == *distributed* ]]; then
test_distributed
test_rpc
elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
else

View File

@ -435,8 +435,14 @@ else()
endif()
set(SELECTED_OP_LIST "" CACHE STRING
"Path to the yaml file that contains the list of operators to include for custom build. Include all operators by default.")
set(STATIC_DISPATCH_BACKEND "" CACHE STRING
"Name of the backend for which static dispatch code is generated, e.g.: CPU.")
option(
STATIC_DISPATCH_BACKEND
"Name of the backend for which static dispatch code is generated, e.g.: CPU."
"")
option(USE_LIGHTWEIGHT_DISPATCH "Enable codegen unboxing for ATen ops, need to work with static dispatch in order to work properly." OFF)
if(USE_LIGHTWEIGHT_DISPATCH AND NOT STATIC_DISPATCH_BACKEND)
message(FATAL_ERROR "Need to enable static dispatch after enabling USE_LIGHTWEIGHT_DISPATCH.")
endif()
option(
TRACING_BASED
"Master flag to build Lite Interpreter with tracing build option"

View File

@ -162,6 +162,9 @@ else()
)
endif()
if(USE_LIGHTWEIGHT_DISPATCH)
set(all_cpu_cpp ${all_cpu_cpp} ${generated_unboxing_sources})
endif()
if(AT_MKL_ENABLED)
set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp})
endif()

View File

@ -0,0 +1,41 @@
#include <torch/csrc/jit/runtime/operator.h>
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/jit/runtime/register_ops_utils.h>
#include <ATen/UnboxingFunctions.h>
// ${generated_comment}
// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up
// incremental rebuilds. See the comment at the top of
// templates/VariableType.cpp for an analogous, in-depth discussion.
//
// Generated by tools/jit/gen_unboxing.py. This file registers all ATen ops into JIT op registry instead of c10
// dispatcher. JIT op registry only takes boxed kernels, so we are calling unboxing functions in UnboxingFunctions.h
// to cast arguments into C++ types (instead of IValue) and delegate to unboxed kernels.
namespace torch { namespace jit {
using autograd::Variable;
using autograd::variable_list;
using at::Scalar;
using at::ScalarType;
using at::Tensor;
using at::TensorOptions;
using at::DeviceGuard;
using ::c10::fmap;
using ::c10::filter;
namespace {
RegisterOperators reg({
// Generated operators
${unboxed_ops}
});
} // anon namespace
}} // namespace torch::jit

View File

@ -0,0 +1,35 @@
#include <ATen/UnboxingFunctions.h>
#include <ATen/Functions.h>
#include <ATen/Tensor.h>
#include <ATen/core/functional.h>
#include <ATen/core/interned_strings.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/stack.h>
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstring>
#include <sstream>
#include <stdexcept>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
namespace at {
namespace unboxing {
using ::c10::fmap;
using ::c10::filter;
using torch::jit::peek;
using torch::jit::drop;
using torch::jit::pack;
using torch::jit::pop;
// Generated function declaration
${definitions}
} // namespace unboxing
} // namespace at

View File

@ -0,0 +1,32 @@
// ${generated_comment}
// Generated by tools/jit/gen_unboxing.py. This file declares code generated boxed C++ functions for operators,
// base off of native_functions.yaml (or similar yaml file with the same syntax). The definition of such a boxed
// function will pop out IValues from the stack then convert them into the correct C++ types based on given schema. This
// unboxing logic is an alternative to template-based metaprogramming unboxing.
#pragma once
#include <ATen/ATen.h>
namespace at {
namespace unboxing {
namespace {
template<typename T, size_t N>
std::array<T, N> as_array(const c10::List<c10::IValue>& list) {
std::array<T, N> res;
AT_ASSERT(list.size() == N);
std::vector<T> vec;
for (c10::IValue elem : list) {
vec.push_back(elem.to<T>());
}
std::copy(vec.begin(), vec.end(), res.begin());
return res;
}
} // namespace <anonymous>
using Stack = std::vector<c10::IValue>;
// Generated function declaration
${declarations}
} // namespace unboxing
} // namespace at

View File

@ -1124,13 +1124,16 @@ endif()
DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch)
endif()
if(BUILD_TEST)
if(BUILD_LITE_INTERPRETER)
add_subdirectory(
${TORCH_ROOT}/test/cpp/lite_interpreter_runtime
${CMAKE_BINARY_DIR}/test_lite_interpreter_runtime
)
add_subdirectory(
${TORCH_ROOT}/test/mobile/lightweight_dispatch
${CMAKE_BINARY_DIR}/test_codegen_unboxing
)
else()
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(

View File

@ -103,6 +103,44 @@ if(INTERN_BUILD_ATEN_OPS)
--static_dispatch_backend ${STATIC_DISPATCH_BACKEND})
endif()
# Codegen unboxing
if(USE_LIGHTWEIGHT_DISPATCH)
file(GLOB_RECURSE all_unboxing_script "${CMAKE_CURRENT_LIST_DIR}/../tools/jit/*.py")
list(APPEND CUSTOM_BUILD_FLAGS --skip_dispatcher_op_registration)
set(GEN_UNBOXING_COMMAND
"${PYTHON_EXECUTABLE}" -m tools.jit.gen_unboxing
--source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen
--install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen
)
set("GEN_UNBOXING_COMMAND_sources"
${GEN_UNBOXING_COMMAND}
--output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake
)
message(STATUS "Generating sources for lightweight dispatch")
execute_process(
COMMAND ${GEN_UNBOXING_COMMAND_sources} --dry-run
RESULT_VARIABLE RETURN_VALUE
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
)
if(NOT RETURN_VALUE EQUAL 0)
message(FATAL_ERROR "Failed to get generated_unboxing_sources list")
endif()
include("${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake")
add_custom_command(
COMMENT "Generating ATen unboxing sources"
OUTPUT
${generated_unboxing_sources}
${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake
COMMAND ${GEN_UNBOXING_COMMAND_sources}
DEPENDS ${all_unboxing_script} ${sources_templates}
${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
)
else() # Otherwise do not generate or include sources into build.
set(generated_unboxing_sources "")
endif()
set(GEN_PER_OPERATOR_FLAG)
if(USE_PER_OPERATOR_HEADERS)
list(APPEND GEN_PER_OPERATOR_FLAG "--per-operator-headers")
@ -182,7 +220,7 @@ if(INTERN_BUILD_ATEN_OPS)
add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS
${generated_headers} ${core_generated_headers} ${cpu_vec_generated_headers} ${ops_generated_headers}
${generated_sources} ${core_generated_sources} ${cpu_vec_generated_sources} ${ops_generated_sources}
${generated_declarations_yaml})
${generated_declarations_yaml} ${generated_unboxing_sources})
add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS
${cuda_generated_headers} ${cuda_generated_sources})
add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE)

View File

@ -117,6 +117,13 @@ if [ "${TRACING_BASED}" == 1 ]; then
else
CMAKE_ARGS+=("-DTRACING_BASED=OFF")
fi
if [ "${USE_LIGHTWEIGHT_DISPATCH}" == 1 ]; then
CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=ON")
CMAKE_ARGS+=("-DSTATIC_DISPATCH_BACKEND=CPU")
else
CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=OFF")
fi
CMAKE_ARGS+=("-DBUILD_MOBILE_BENCHMARK=$BUILD_MOBILE_BENCHMARK")
CMAKE_ARGS+=("-DBUILD_MOBILE_TEST=$BUILD_MOBILE_TEST")
CMAKE_ARGS+=("-DBUILD_PYTHON=OFF")

View File

@ -88,6 +88,12 @@ if [ "${TRACING_BASED}" == 1 ]; then
else
CMAKE_ARGS+=("-DTRACING_BASED=OFF")
fi
if [ "${USE_LIGHTWEIGHT_DISPATCH}" == 1 ]; then
CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=ON")
CMAKE_ARGS+=("-DSTATIC_DISPATCH_BACKEND=CPU")
else
CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=OFF")
fi
CMAKE_ARGS+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")

View File

@ -506,6 +506,10 @@ class build_ext(setuptools.command.build_ext.build_ext):
report(' -- USE_MPI={}'.format(cmake_cache_vars['USE_OPENMPI']))
else:
report('-- Building without distributed package')
if cmake_cache_vars['STATIC_DISPATCH_BACKEND']:
report('-- Using static dispatch with backend {}'.format(cmake_cache_vars['STATIC_DISPATCH_BACKEND']))
if cmake_cache_vars['USE_LIGHTWEIGHT_DISPATCH']:
report('-- Using lightweight dispatch')
# Do not use clang to compile extensions if `-fstack-clash-protection` is defined
# in system CFLAGS

View File

@ -23,6 +23,10 @@ target_include_directories(
target_link_libraries(test_lite_interpreter_runtime PRIVATE torch gtest backend_with_compiler_runtime)
if(LINUX)
target_link_libraries(test_lite_interpreter_runtime PRIVATE "-Wl,--no-as-needed,$<TARGET_FILE:backend_with_compiler_runtime>,--as-needed")
endif()
if(INSTALL_TEST)
install(TARGETS test_lite_interpreter_runtime DESTINATION bin)
# Install PDB files for MSVC builds

View File

@ -0,0 +1,23 @@
cmake_minimum_required(VERSION 3.1)
set(TORCH_ROOT ${CMAKE_CURRENT_LIST_DIR}/../../..)
set(TEST_ROOT ${TORCH_ROOT}/test/mobile/lightweight_dispatch)
add_executable(test_codegen_unboxing
${TEST_ROOT}/test_lightweight_dispatch.cpp
${TEST_ROOT}/test_codegen_unboxing.cpp
)
target_include_directories(test_codegen_unboxing PRIVATE ${ATen_CPU_INCLUDE})
target_compile_definitions(test_codegen_unboxing PRIVATE USE_GTEST)
set(TEST_UNBOXING_DEPENDENCIES torch gtest)
target_link_libraries(test_codegen_unboxing PRIVATE
${TEST_UNBOXING_DEPENDENCIES}
)
if(INSTALL_TEST)
install(TARGETS test_codegen_unboxing DESTINATION bin)
endif()

View File

@ -0,0 +1,55 @@
#!/bin/bash
# This script should be called from .jenkins/pytorch/build.sh. Assuming we are at pytorch source root directory.
# Required environment variable: $BUILD_ENVIRONMENT
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.
set -ex -o pipefail
# shellcheck disable=SC2034
echo "Build lite interpreter with lightweight dispatch."
CUSTOM_TEST_ARTIFACT_BUILD_DIR=${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../}
mkdir -pv "${CUSTOM_TEST_ARTIFACT_BUILD_DIR}"
BUILD_LIBTORCH_PY="$PWD/tools/build_libtorch.py"
TEST_SRC_ROOT="$PWD/test/mobile/lightweight_dispatch"
pushd "$CUSTOM_TEST_ARTIFACT_BUILD_DIR"
# prepare test
python "$TEST_SRC_ROOT/tests_setup.py" setup
export USE_DISTRIBUTED=0
export USE_LIGHTWEIGHT_DISPATCH=1
export STATIC_DISPATCH_BACKEND="CPU"
export BUILD_LITE_INTERPRETER=1
python "${BUILD_LIBTORCH_PY}"
ret=$?
if [ "$ret" -ne 0 ]; then
echo "Lite interpreter build failed!"
exit "$ret"
fi
# run test
if ! build/bin/test_codegen_unboxing; then
echo "test_codegen_unboxing has failure!"
exit 1
fi
# shutdown test
python "$TEST_SRC_ROOT/tests_setup.py" shutdown
# run lite interpreter tests
if ! build/bin/test_lite_interpreter_runtime; then
echo "test_lite_interpreter_runtime has failure!"
exit 1
fi
popd
exit 0

View File

@ -0,0 +1,195 @@
#include <gtest/gtest.h>
#include <test/cpp/jit/test_utils.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/frontend/resolver.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>
// Cover codegen'd unboxing logic for these types:
//'Device',
//'Device?',
//'Dimname',
//'Dimname[1]',
//'Dimname[]',
//'Dimname[]?',
//'Generator?',
//'Layout?',
//'MemoryFormat',
//'MemoryFormat?',
//'Scalar',
//'Scalar?',
//'ScalarType',
//'ScalarType?',
//'Scalar[]',
//'Storage',
//'Stream',
//'Tensor',
//'Tensor(a!)',
//'Tensor(a!)[]',
//'Tensor(a)',
//'Tensor(b!)',
//'Tensor(c!)',
//'Tensor(d!)',
//'Tensor?',
//'Tensor?[]',
//'Tensor[]',
//'bool',
//'bool?',
//'bool[2]',
//'bool[3]',
//'bool[4]',
//'float',
//'float?',
//'float[]?',
//'int',
//'int?',
//'int[1]',
//'int[1]?',
//'int[2]',
//'int[2]?',
//'int[3]',
//'int[4]',
//'int[5]',
//'int[6]',
//'int[]',
//'int[]?',
//'str',
//'str?'
namespace torch {
namespace jit {
namespace mobile {
// covers int[], ScalarType?, Layout?, Device?, bool?
TEST(LiteInterpreterTest, Ones) {
// Load check in model: ones.ptl
auto testModelFile = "ones.ptl";
// class Model(torch.nn.Module):
// def forward(self, x: int):
// a = torch.ones([3, x], dtype=torch.int64, layout=torch.strided, device="cpu")
// return a
Module bc = _load_for_mobile(testModelFile);
std::vector<c10::IValue> input{c10::IValue(4)};
const auto result = bc.forward(input);
ASSERT_EQ(result.toTensor().size(0), 3);
ASSERT_EQ(result.toTensor().size(1), 4);
}
TEST(LiteInterpreterTest, Index) {
// Load check in model: index.ptl
auto testModelFile = "index.ptl";
// class Model(torch.nn.Module):
// def forward(self, index):
// a = torch.zeros(2, 2)
// a[0][1] = 1
// a[1][0] = 2
// a[1][1] = 3
// return a[index]
Module bc = _load_for_mobile(testModelFile);
int64_t ind_1 = 0;
const auto result_1 = bc.forward({at::tensor(ind_1)});
at::Tensor expected = at::empty({1, 2}, c10::TensorOptions(c10::ScalarType::Float));
expected[0][0] = 0;
expected[0][1] = 1;
AT_ASSERT(result_1.toTensor().equal(expected));
}
TEST(LiteInterpreterTest, Gradient) {
// Load check in model: gradient.ptl
auto testModelFile = "gradient.ptl";
// class Model(torch.nn.Module):
// def forward(self, a: int):
// values = torch.tensor([4., 1., 1., 16.], )
// if a == 0:
// return torch.gradient(values, spacing=torch.scalar_tensor(2., dtype=torch.float64))
// elif a == 1:
// return torch.gradient(values, spacing=[torch.tensor(1.).item()])
Module bc = _load_for_mobile(testModelFile);
const auto result_1 = bc.forward({0});
at::Tensor expected_1 = at::tensor({-1.5, -0.75, 3.75, 7.5}, c10::TensorOptions(c10::ScalarType::Float));
AT_ASSERT(result_1.toList().get(0).toTensor().equal(expected_1));
const auto result_2 = bc.forward({1});
at::Tensor expected_2 = at::tensor({-3.0, -1.5, 7.5, 15.0}, c10::TensorOptions(c10::ScalarType::Float));
AT_ASSERT(result_2.toList().get(0).toTensor().equal(expected_2));
}
TEST(LiteInterpreterTest, Upsample) {
// Load check in model: upsample.ptl
auto testModelFile = "upsample.ptl";
// model = torch.nn.Upsample(scale_factor=(2.0,), mode="linear")
Module bc = _load_for_mobile(testModelFile);
const auto result_1 = bc.forward({at::ones({1, 2, 3})});
at::Tensor expected_1 = at::ones({1, 2, 6}, c10::TensorOptions(c10::ScalarType::Float));
AT_ASSERT(result_1.toTensor().equal(expected_1));
}
TEST(LiteInterpreterTest, IndexTensor) {
// Load check in model: Index_Tensor.ptl
auto testModelFile = "index_Tensor.ptl";
// class Model(torch.nn.Module):
// def forward(self, index):
// values = torch.tensor([4., 1., 1., 16.], )
// return values[[index, torch.tensor(0)]]
Module bc = _load_for_mobile(testModelFile);
const auto result_1 = bc.forward({at::tensor({1}, c10::TensorOptions(c10::ScalarType::Long))});
at::Tensor expected_1 = at::tensor({1.}, c10::TensorOptions(c10::ScalarType::Float));
AT_ASSERT(result_1.toTensor().equal(expected_1));
}
TEST(LiteInterpreterTest, Conv2d) {
// Load check in model: conv2d.ptl
auto testModelFile = "conv2d.ptl";
// model = torch.nn.Conv2d(1, 2, (2, 2), stride=(1, 1), padding=(1, 1))
Module bc = _load_for_mobile(testModelFile);
const auto result_1 = bc.forward({at::ones({1, 1, 1, 1})});
ASSERT_EQ(result_1.toTensor().sizes(), c10::IntArrayRef ({1,2,2,2}));
}
TEST(LiteInterpreterTest, AddTensor) {
// Load check in model: add_Tensor.ptl
auto testModelFile = "add_Tensor.ptl";
// class Model(torch.nn.Module):
// def forward(self, a):
// values = torch.ones(size=[2, 3], names=['N', 'C'])
// values[0][0] = a[0]
// return values
Module bc = _load_for_mobile(testModelFile);
const auto result_1 = bc.forward({at::tensor({1, 2, 3}, c10::TensorOptions(c10::ScalarType::Long))});
at::Tensor expected_1 = at::tensor({2, 3, 4}, c10::TensorOptions(c10::ScalarType::Long));
AT_ASSERT(result_1.toTensor().equal(expected_1));
}
TEST(LiteInterpreterTest, DivideTensor) {
// Load check in model: add_Tensor.ptl
auto testModelFile = "divide_Tensor.ptl";
// class Model(torch.nn.Module):
// def forward(self, b):
// a = torch.tensor(3, dtype=torch.int64)
// out = torch.empty(size=[1], dtype=torch.float)
// torch.div(b, a, out=out)
// return [torch.div(b, a, rounding_mode='trunc'), out]
Module bc = _load_for_mobile(testModelFile);
const auto result_1 = bc.forward({at::tensor({-12}, c10::TensorOptions(c10::ScalarType::Long))});
at::Tensor expected_1 = at::tensor({-4}, c10::TensorOptions(c10::ScalarType::Long));
at::Tensor expected_2 = at::tensor({-4.}, c10::TensorOptions(c10::ScalarType::Float));
AT_ASSERT(result_1.toList().get(0).toTensor().equal(expected_1));
AT_ASSERT(result_1.toList().get(1).toTensor().equal(expected_2));
}
} // namespace mobile
} // namespace jit
} // namespace torch

View File

@ -0,0 +1,18 @@
#include <gtest/gtest.h>
std::string add_negative_flag(const std::string& flag) {
std::string filter = ::testing::GTEST_FLAG(filter);
if (filter.find('-') == std::string::npos) {
filter.push_back('-');
} else {
filter.push_back(':');
}
filter += flag;
return filter;
}
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
::testing::GTEST_FLAG(filter) = add_negative_flag("*_CUDA:*_MultiCUDA");
return RUN_ALL_TESTS();
}

View File

@ -0,0 +1,180 @@
import os
import sys
import torch
class Setup(object):
def setup(self):
raise NotImplementedError()
def shutdown(self):
raise NotImplementedError()
class FileSetup(object):
path = None
def shutdown(self):
if os.path.exists(self.path):
os.remove(self.path)
pass
class ModelWithDTypeDeviceLayoutPinMemory(FileSetup):
path = 'ones.ptl'
def setup(self):
class Model(torch.nn.Module):
def forward(self, x: int):
a = torch.ones(size=[3, x], dtype=torch.int64, layout=torch.strided, device="cpu", pin_memory=False)
return a
model = Model()
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
class ModelWithTensorOptional(FileSetup):
path = 'index.ptl'
def setup(self):
class Model(torch.nn.Module):
def forward(self, index):
a = torch.zeros(2, 2)
a[0][1] = 1
a[1][0] = 2
a[1][1] = 3
return a[index]
model = Model()
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
# gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[]
class ModelWithScalarList(FileSetup):
path = 'gradient.ptl'
def setup(self):
class Model(torch.nn.Module):
def forward(self, a: int):
values = torch.tensor([4., 1., 1., 16.], )
if a == 0:
return torch.gradient(values, spacing=torch.scalar_tensor(2., dtype=torch.float64))
elif a == 1:
return torch.gradient(values, spacing=[torch.tensor(1.).item()])
model = Model()
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
# upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
class ModelWithFloatList(FileSetup):
path = 'upsample.ptl'
def setup(self):
model = torch.nn.Upsample(scale_factor=(2.0,), mode="linear", align_corners=False, recompute_scale_factor=True)
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
# index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
class ModelWithListOfOptionalTensors(FileSetup):
path = 'index_Tensor.ptl'
def setup(self):
class Model(torch.nn.Module):
def forward(self, index):
values = torch.tensor([[4., 1., 1., 16.]])
return values[torch.tensor(0), index]
model = Model()
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
# conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1,
# int groups=1) -> Tensor
class ModelWithArrayOfInt(FileSetup):
path = 'conv2d.ptl'
def setup(self):
model = torch.nn.Conv2d(1, 2, (2, 2), stride=(1, 1), padding=(1, 1))
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
# add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
# ones_like(Tensor self, *, ScalarType?, dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None,
# MemoryFormat? memory_format=None) -> Tensor
class ModelWithTensors(FileSetup):
path = 'add_Tensor.ptl'
def setup(self):
class Model(torch.nn.Module):
def forward(self, a):
b = torch.ones_like(a)
return a + b
model = Model()
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
class ModelWithStringOptional(FileSetup):
path = 'divide_Tensor.ptl'
def setup(self):
class Model(torch.nn.Module):
def forward(self, b):
a = torch.tensor(3, dtype=torch.int64)
out = torch.empty(size=[1], dtype=torch.float)
torch.div(b, a, out=out)
return [torch.div(b, a, rounding_mode='trunc'), out]
model = Model()
# Script the model and save
script_model = torch.jit.script(model)
script_model._save_for_lite_interpreter(self.path)
tests = [
ModelWithDTypeDeviceLayoutPinMemory(),
ModelWithTensorOptional(),
ModelWithScalarList(),
ModelWithFloatList(),
ModelWithListOfOptionalTensors(),
ModelWithArrayOfInt(),
ModelWithTensors(),
ModelWithStringOptional(),
]
def setup():
for test in tests:
test.setup()
def shutdown():
for test in tests:
test.shutdown()
if __name__ == "__main__":
command = sys.argv[1]
if command == "setup":
setup()
elif command == "shutdown":
shutdown()

View File

@ -209,7 +209,6 @@ Check this module for more information.
return f"c10::impl::check_tensor_options_and_extract_memory_format({options}, {memory_format})"
except UnsatError:
return memory_format
elif goal == NamedCType("options", BaseCType(tensorOptionsT)):
dtype = direct_solve(NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))))
pin_memory = direct_solve(NamedCType("pin_memory", OptionalCType(BaseCType(boolT))))

View File

@ -0,0 +1,208 @@
from typing import List, Tuple
from tools.codegen.api import cpp
from tools.codegen.api.types import Binding, CType, CppSignatureGroup
from tools.codegen.model import (
Argument,
NativeFunction,
Type,
BaseType,
OptionalType,
ListType,
BaseTy,
)
# This file generates the code for unboxing wrappers, i.e., the glue logic to unbox a boxed operator and convert the
# ivalues from stack to correct arguments to the unboxed kernel, based on corresponding JIT schema. This codegen is
# an alternative way to generate unboxing wrappers similar to the existing C++ metaprogramming approach but gets the
# job done statically. These generated unboxing wrappers will be useful under the scenario where we need to register
# a fixed set of operators known at compile time and thus can save some time in runtime initialization phase.
#
# Here's an example on how the codegen works:
#
# - Function Schema (source of truth)
#
# aten::empty.names(int[] size, *, Dimname[]? names,
# ScalarType? dtype=None, Layout? layout=None,
# Device? device=None, bool? pin_memory=None,
# MemoryFormat? memory_format=None) -> Tensor
# - Argument Conversion
# Generates C++ code to convert an ivalue (from stack) to its underlying C++ type.
# - int[] size
# ```cpp
# const c10::List<c10::IValue> size_list_in = (std::move(peek(stack, 0, 7))).toList();
#
# std::vector<int64_t> size_vec;
# for (c10::IValue size_elem: size_list_in) {
# int64_t size_base = size_elem.to<int64_t>();
# size_vec.push_back(size_base);
# }
# at::ArrayRef<int64_t> size_list_out(size_vec);
# ~~~~~~~~~~~~~ <-- The converted argument from ivalues in the stack.
# Will be passed to unboxed kernel.
# ```
# - Dimname[]? names
# ```cpp
# c10::optional<c10::IValue> names_opt = (std::move(peek(stack, 1, 7))).toOptional<c10::IValue>();
# c10::optional<at::ArrayRef<at::Dimname>> names_opt_out;
# if (names_opt.has_value()) {
# ~~~~~~~~~~~ <-- Unwrapping optional shell
# const c10::IValue names_opt_in = names_opt.value();
# const c10::List<c10::IValue> names_list_in = names_opt_in.toList();
#
# std::vector<at::Dimname> names_vec;
# for (c10::IValue names_elem: names_list_in) {
# ~~~~~~~~~~~~~~~~~~~~~~~~~ <-- Unrolling list, then convert elements one by one.
# at::Dimname names_base = names_elem.to<at::Dimname>();
# names_vec.push_back(names_base);
# }
# at::ArrayRef<at::Dimname> names_list_out(names_vec);
#
# names_opt_out = c10::optional<at::ArrayRef<at::Dimname>>(names_list_out);
# } else {
# names_opt_out = c10::optional<at::ArrayRef<at::Dimname>>();
# }
# ```
# - ScalarType? dtype (similarly for the rest of the arguments)
# ```cpp
# c10::optional<c10::IValue> dtype_opt = (std::move(peek(stack, 2, 7))).toOptional<c10::IValue>();
# c10::optional<at::ScalarType> dtype_opt_out;
# if (dtype_opt.has_value()) {
# const c10::IValue dtype_opt_in = dtype_opt.value();
# at::ScalarType dtype_base = dtype_opt_in.to<at::ScalarType>();
# ~~~~~~~~~~~~~~~~~~~~ <-- For base types, convert ivalue to it
# directly using ".to<T>()" API.
# dtype_opt_out = c10::optional<at::ScalarType>(dtype_base);
# } else {
# dtype_opt_out = c10::optional<at::ScalarType>();
# }
# ```
#
# - Unboxed Kernel Call
# ```cpp
# auto result_ = torch::empty(
# size_list_out,
# names_opt_out,
# options,
# memory_format_opt_out
# );
# ```
#
# - Push Result Back to Stack
# ```cpp
# drop(stack, 7);
# pack(stack, std::move(result_));
# ```
connector = "\n\t"
# Return unboxing function name for a NativeFunction
def name(f: NativeFunction) -> str:
return f.func.name.unambiguous_name()
# Convert all the arguments in a NativeFunction to C++ code
def convert_arguments(f: NativeFunction) -> Tuple[List[Binding], List[str]]:
# we need the 'self' argument so method needs to be False
args = CppSignatureGroup.from_native_function(f, method=False).most_faithful_signature().arguments()
code_list = [f"c10::IValue {args[i].name} = std::move(peek(stack, {i}, {len(args)}));" for i in
range(len(args))] + [""]
binding_list = []
for i, arg in enumerate(args):
# expecting only Argument
if not isinstance(arg.argument, Argument):
raise Exception(f"Unexpected argument type, expecting `Argument` but got {arg}")
argument: Argument = arg.argument
unboxed_name, _, code, decl = argumenttype_ivalue_convert(
argument.type, argument.name, mutable=argument.is_write
)
code_list.extend(decl)
code_list.extend(code)
binding_list.append(arg.with_name(unboxed_name))
return binding_list, code_list
# Takes in the type, name and mutability corresponding to an argument, and generates a tuple of:
# (1) the C++ code necessary to unbox the argument
# (2) A Binding corresponding to the newly created unboxed variable, including variable name and its CType
def argumenttype_ivalue_convert(t: Type, arg_name: str, *, mutable: bool = False) -> Tuple[str, CType, List[str], List[str]]:
ctype = cpp.argumenttype_type(t=t, mutable=mutable, binds=arg_name).type
if isinstance(t, BaseType):
out_name = f"{arg_name}_base"
code, decl = _gen_code_base_type(arg_name=arg_name, out_name=out_name, ctype=ctype)
elif isinstance(t, OptionalType):
out_name = f"{arg_name}_opt_out"
code, decl = _gen_code_optional_type(arg_name=arg_name, out_name=out_name, t=t, ctype=ctype)
elif isinstance(t, ListType):
out_name = f"{arg_name}_list_out"
code, decl = _gen_code_list_type(arg_name=arg_name, out_name=out_name, t=t, ctype=ctype)
else:
raise Exception(f"Cannot handle type {t}. arg_name: {arg_name}")
return out_name, ctype, code, decl
def _gen_code_base_type(arg_name: str, out_name: str, ctype: CType) -> Tuple[List[str], List[str]]:
return [f"{ctype.cpp_type(strip_ref=True)} {out_name} = {arg_name}.to<{ctype.cpp_type(strip_ref=True)}>();"], []
def _gen_code_optional_type(arg_name: str, out_name: str, t: OptionalType, ctype: CType) -> Tuple[List[str], List[str]]:
in_name = f"{arg_name}_opt_in"
res_name, _, res_code, decl = argumenttype_ivalue_convert(t.elem, in_name)
return f"""
c10::optional<c10::IValue> {arg_name}_opt = {arg_name}.toOptional<c10::IValue>();
{ctype.cpp_type(strip_ref=True)} {out_name};
if ({arg_name}_opt.has_value()) {{
const c10::IValue {in_name} = {arg_name}_opt.value();
{connector.join(res_code)}
{out_name} = {ctype.cpp_type(strip_ref=True)}({res_name});
}} else {{
{out_name} = {ctype.cpp_type(strip_ref=True)}();
}}
""".split("\n"), decl
def _gen_code_list_type(arg_name: str, out_name: str, t: ListType, ctype: CType) -> Tuple[List[str], List[str]]:
in_name = f"{arg_name}_list_in"
elem_name = f"{arg_name}_elem"
code = [f"const c10::List<c10::IValue> {in_name} = {arg_name}.toList();"]
res_name, res_ctype, res_code, decl = argumenttype_ivalue_convert(t.elem, elem_name)
# handle list type with size, e.g., bool[4]
if isinstance(t.elem, BaseType) and t.elem.name == BaseTy.bool and t.size:
code.extend(
f"""
{ctype.cpp_type(strip_ref=True)} {out_name} = as_array<{res_ctype.cpp_type(strip_ref=True)}, {t.size}>({in_name});
""".split(
"\n"
)
)
# we have to use c10::List for optional element. e.g., Tensor?[] -> c10::List<c10::optional<at::Tensor>>
elif isinstance(t.elem, OptionalType):
code.extend(
f"""
{ctype.cpp_type(strip_ref=True)} {out_name};
for (c10::IValue {elem_name}: {in_name}) {{
{connector.join(res_code)}
{out_name}.push_back({res_name});
}}
""".split(
"\n"
)
)
else:
# use ArrayRef as default.
vec_name = arg_name + "_vec"
# need to bring vector instantiation out of scope so that ArrayRef has valid data
decl.append(f"std::vector<{res_ctype.cpp_type(strip_ref=True)}> {vec_name};")
code.extend(
f"""
for (c10::IValue {elem_name}: {in_name}) {{
{connector.join(res_code)}
{vec_name}.push_back({res_name});
}}
{ctype.cpp_type(strip_ref=True)} {out_name}({vec_name});
""".split(
"\n"
)
)
return code, decl

View File

@ -28,7 +28,7 @@ import tools.codegen.api.structured as structured
from tools.codegen.api.translate import translate
from tools.codegen.selective_build.selector import SelectiveBuilder
from tools.codegen.utils import (
Target, concatMap, context, mapMaybe, YamlDumper, YamlLoader, FileManager, assert_never
Target, concatMap, context, mapMaybe, YamlDumper, YamlLoader, FileManager, assert_never, make_file_manager
)
from tools.codegen.context import (method_with_native_function,
native_function_manager,
@ -250,8 +250,8 @@ def static_dispatch_extra_headers(backend: Optional[BackendIndex], skip_tensor_i
def static_dispatch(
f: NativeFunction, cpp_sig: CppSignature,
*, method: bool, backend_index: Optional[BackendIndex]
f: NativeFunction, cpp_sig: CppSignature,
*, method: bool, backend_index: Optional[BackendIndex]
) -> Optional[str]:
if backend_index is None or f.manual_kernel_registration:
return None
@ -369,7 +369,7 @@ static C10_NOINLINE c10::TypedOperatorHandle<{name}::schema> create_{name}_typed
assert_never(self.target)
# Generates Function.h, which provides the functional public C++ API,
# Generates Functions.h, which provides the functional public C++ API,
# and the scaffolding to call into the dispatcher from these functions.
@dataclass(frozen=True)
class ComputeFunction:
@ -952,7 +952,8 @@ def compute_registration_declarations(f: NativeFunction, backend_indices: Dict[D
comment_data : Dict[str, str] = {
'schema': f'aten::{f.func}',
# TODO: What exactly is the semantics of the 'dispatch' field?
'dispatch': str({k for k, v in backend_indices.items() if v.has_kernel(f)} != {DispatchKey.CompositeImplicitAutograd}),
'dispatch': str(
{k for k, v in backend_indices.items() if v.has_kernel(f)} != {DispatchKey.CompositeImplicitAutograd}),
'default': str(f.has_composite_kernel or has_autogenerated_composite_kernel(f))
}
return f"""{returns_type} {name}({args_str}); // {json.dumps(comment_data)}
@ -1350,7 +1351,6 @@ def gen_source_files(
native_functions: Sequence[NativeFunction],
grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]],
structured_native_functions: Sequence[NativeFunctionsGroup],
static_dispatch_idx: Optional[BackendIndex],
selector: SelectiveBuilder,
backend_indices: Dict[DispatchKey, BackendIndex],
core_fm: FileManager,
@ -1362,6 +1362,7 @@ def gen_source_files(
rocm: bool,
force_schema_registration: bool,
per_operator_headers: bool,
skip_dispatcher_op_registration: bool,
) -> None:
extra_cuda_headers = '''\
#include <c10/cuda/CUDAGuard.h>
@ -1446,7 +1447,7 @@ def gen_source_files(
class_method_name=None),
grouped_native_functions
)),
'dispatch_registrations': list(concatMap(
'dispatch_registrations': [] if skip_dispatcher_op_registration else list(concatMap(
dest.RegisterDispatchKey(
backend_index,
Target.REGISTRATION,
@ -1507,7 +1508,8 @@ def gen_source_files(
if force_schema_registration:
schema_selector = SelectiveBuilder.get_nop_selector()
cpu_fm.write('RegisterSchema.cpp', lambda: {
'schema_registrations': list(mapMaybe(RegisterSchema(schema_selector), native_functions)),
'schema_registrations': [] if skip_dispatcher_op_registration
else list(mapMaybe(RegisterSchema(schema_selector), native_functions)),
})
def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str:
@ -1630,6 +1632,10 @@ def main() -> None:
parser.add_argument(
'--static_dispatch_backend',
help='generate static dispatch code for the specific backend (if set)')
parser.add_argument(
'--skip_dispatcher_op_registration',
action='store_true',
help='Avoid registering operators into the dispatcher.')
parser.add_argument(
'--force_schema_registration',
action='store_true',
@ -1656,8 +1662,6 @@ def main() -> None:
structured_native_functions = [g for g in grouped_native_functions
if isinstance(g, NativeFunctionsGroup)]
template_dir = os.path.join(options.source_path, "templates")
# NB: It is mandatory to NOT use os.path.join here, as the install directory
# will eventually be ingested by cmake, which does not respect Windows style
# path slashes. If you switch this to use os.path.join, you'll get an error
@ -1673,18 +1677,11 @@ def main() -> None:
ops_install_dir = f'{options.install_dir}/ops'
pathlib.Path(ops_install_dir).mkdir(parents=True, exist_ok=True)
def make_file_manager(install_dir: str) -> FileManager:
return FileManager(
install_dir=install_dir,
template_dir=template_dir,
dry_run=options.dry_run
)
core_fm = make_file_manager(core_install_dir)
cpu_fm = make_file_manager(options.install_dir)
cpu_vec_fm = make_file_manager(options.install_dir)
cuda_fm = make_file_manager(options.install_dir)
ops_fm = make_file_manager(ops_install_dir)
core_fm = make_file_manager(options=options, install_dir=core_install_dir)
cpu_fm = make_file_manager(options=options)
cpu_vec_fm = make_file_manager(options=options)
cuda_fm = make_file_manager(options=options)
ops_fm = make_file_manager(options=options, install_dir=ops_install_dir)
extra_cuda_headers = '''\
#include <c10/cuda/CUDAGuard.h>
@ -1721,7 +1718,6 @@ def main() -> None:
native_functions=native_functions,
grouped_native_functions=grouped_native_functions,
structured_native_functions=structured_native_functions,
static_dispatch_idx=static_dispatch_idx,
selector=selector,
backend_indices=backend_indices,
core_fm=core_fm,
@ -1733,6 +1729,7 @@ def main() -> None:
rocm=options.rocm,
force_schema_registration=options.force_schema_registration,
per_operator_headers=options.per_operator_headers,
skip_dispatcher_op_registration=options.skip_dispatcher_op_registration,
)
if 'headers' in options.generate:

View File

@ -4,6 +4,7 @@ import hashlib
import os
import re
import textwrap
from argparse import Namespace
from typing import Tuple, List, Iterable, Iterator, Callable, Sequence, TypeVar, Optional, Dict, Any, Union, Set, NoReturn
from enum import Enum
@ -235,3 +236,10 @@ class FileManager:
content = 'set({}\n {})'.format(
variable_name, '\n '.join('"' + name + '"' for name in sorted(self.filenames)))
self._write_if_changed(filename, content)
# Helper function to generate file manager
def make_file_manager(options: Namespace, install_dir: Optional[str] = None) -> FileManager:
template_dir = os.path.join(options.source_path, "templates")
install_dir = install_dir if install_dir else options.install_dir
return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=options.dry_run)

tools/jit/gen_unboxing.py (new file, 182 lines)
View File

@ -0,0 +1,182 @@
# Generates RegisterCodegenUnboxedKernels.cpp, UnboxingFunctions.h and UnboxingFunctions.cpp.
import argparse
import os
import pathlib
from dataclasses import dataclass
from tools.codegen.api import unboxing
from tools.codegen.api.translate import translate
from tools.codegen.api.types import CppSignatureGroup
from tools.codegen.api.unboxing import convert_arguments
from tools.codegen.context import method_with_native_function
from tools.codegen.gen import parse_native_yaml, cpp_string
from tools.codegen.model import NativeFunction, NativeFunctionsGroup, Variant
from tools.codegen.utils import Target, FileManager, mapMaybe, make_file_manager
from typing import Union, Sequence
from typing_extensions import Literal
# Generates UnboxingFunctions.h & UnboxingFunctions.cpp.
@dataclass(frozen=True)
class ComputeUnboxingFunctions:
target: Union[Literal[Target.DECLARATION], Literal[Target.DEFINITION]]
@method_with_native_function
def __call__(self, f: NativeFunction) -> str:
if self.target is Target.DECLARATION:
# Note [The ATen Codegen Unboxing API]
# Similar to the ATen Operators API, ATen Codegen Unboxing API lives in the at::unboxing namespace, and
# will be used by codegen unboxing wrappers (CodegenUnboxingWrappers.cpp).
# The Wrappers will be registered into torch::jit::OperatorRegistry using RegisterOperators API.
#
# Important characteristics about the Codegen Unboxing API:
# (1) It follows the OperatorRegistry API.
# This is kind of necessary to avoid overhead.
# For example: if it followed the C++ API, then all of the faithful C++ factory functions
# would need to wrap their arguments into TensorOptions only to unwrap them again.
# (2) Under the hood it calls C++ API.
return f"""
// aten::{f.func}
TORCH_API void {f.func.name.unambiguous_name()}(Stack & stack);
"""
else:
sig_group = CppSignatureGroup.from_native_function(
f, method=(Variant.method in f.variants)
)
sig = sig_group.most_faithful_signature()
# parse arguments into C++ code
binding_list, code_list = convert_arguments(f)
# for each C++ argument, generate the conversion code
code_connector = "\n\t"
arg_connector = ", "
# function call and push back to stack
prefix = "self_base." if sig.method else "at::"
translated_args = translate(binding_list, sig.arguments(), method=sig.method)
args_str = f"{arg_connector.join(e.expr for e in translated_args)}"
if len(f.func.returns) == 0:
ret_str = ""
push_str = ""
else:
ret_str = "auto result_ = "
push_str = """
pack(stack, std::move(result_));
"""
return f"""
// aten::{f.func}
TORCH_API void {f.func.name.unambiguous_name()}(Stack & stack) {{
{code_connector.join(code_list)}
drop(stack, {len(binding_list)});
{ret_str}{prefix}{sig.name()}({args_str});
{push_str}
}}
"""
# Generates RegisterCodegenUnboxedKernels.cpp.
@dataclass(frozen=True)
class ComputeCodegenUnboxedKernels:
@method_with_native_function
def __call__(self, f: NativeFunction) -> str:
# We unconditionally generate function wrappers,
sig_group = CppSignatureGroup.from_native_function(
f, method=(Variant.method in f.variants)
)
sig = sig_group.most_faithful_signature()
# escape double quote in schema, get rid of extra double quotes
schema = cpp_string(str(sig.func))[1:-1]
return f"""
OperatorGenerator(
TORCH_SELECTIVE_SCHEMA("aten::{schema}"),
[](Stack & stack) {{
RECORD_FUNCTION("{sig.name()}", std::vector<c10::IValue>());
at::unboxing::{unboxing.name(f)}(stack);
}},
aliasAnalysisFromSchema()
),
"""
def gen_unboxing(
*,
native_functions: Sequence[NativeFunction],
cpu_fm: FileManager,
) -> None:
def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str:
return fn.root_name
cpu_fm.write_sharded(
"UnboxingFunctions.cpp",
native_functions,
key_fn=key_func,
env_callable=lambda fn: {
"definitions": [ComputeUnboxingFunctions(Target.DEFINITION)(fn)]
},
num_shards=5,
sharded_keys={"definitions"},
)
cpu_fm.write(
"UnboxingFunctions.h",
lambda: {
"declarations": list(
mapMaybe(ComputeUnboxingFunctions(Target.DECLARATION), native_functions)
),
},
)
cpu_fm.write_sharded(
"RegisterCodegenUnboxedKernels.cpp",
native_functions,
key_fn=key_func,
env_callable=lambda fn: {"unboxed_ops": [ComputeCodegenUnboxedKernels()(fn)]},
num_shards=5,
sharded_keys={"unboxed_ops"},
)
def main() -> None:
parser = argparse.ArgumentParser(description="Generate unboxing source files")
parser.add_argument(
"-s",
"--source-path",
help="path to source directory for ATen",
default="aten/src/ATen",
)
parser.add_argument(
"-d", "--install_dir", help="output directory", default="build/aten/src/ATen"
)
parser.add_argument(
'-o',
'--output-dependencies',
help='output a list of dependencies into the given file and exit')
parser.add_argument(
'--dry-run', action='store_true',
help='run without writing any files (still updates outputs)')
options = parser.parse_args()
native_yaml_path = os.path.join(options.source_path, "native/native_functions.yaml")
parsed_yaml = parse_native_yaml(native_yaml_path)
native_functions, backend_indices = (
parsed_yaml.native_functions,
parsed_yaml.backend_indices,
)
cpu_fm = make_file_manager(options=options)
gen_unboxing(native_functions=native_functions, cpu_fm=cpu_fm)
if options.output_dependencies:
depfile_path = pathlib.Path(options.output_dependencies).resolve()
depfile_name = depfile_path.name
depfile_stem = depfile_path.stem
path = depfile_path.parent / depfile_name
cpu_fm.write_outputs(depfile_stem, str(path))
if __name__ == "__main__":
main()

View File

@ -278,7 +278,8 @@ class CMake:
'ONNX_NAMESPACE',
'ATEN_THREADING',
'WERROR',
'OPENSSL_ROOT_DIR')
'OPENSSL_ROOT_DIR',
'STATIC_DISPATCH_BACKEND')
})
# Aliases which are lower priority than their canonical option
@ -289,7 +290,6 @@ class CMake:
'CMAKE_CUDA_COMPILER': 'CUDA_NVCC_EXECUTABLE',
'CUDACXX': 'CUDA_NVCC_EXECUTABLE'
}
for var, val in my_env.items():
# We currently pass over all environment variables that start with "BUILD_", "USE_", and "CMAKE_". This is
# because we currently have no reliable way to get the list of all build options we have specified in

View File

@ -0,0 +1,11 @@
# Little stub file to get BUILD.bazel to play along
import os.path
import sys
root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, root)
import tools.jit.gen_unboxing
tools.jit.gen_unboxing.main()