Update on "[Pytorch][Bootcamp] Add fix and testing for vectorized Adadelta optimizer to handle complex numbers"

Changed the step function of the vectorized (multi-tensor) Adadelta optimizer so that it handles complex numbers as two real numbers, as described in #65711 on GitHub.
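As a rough illustration of the "two real numbers" idea, here is a minimal, hedged sketch (not the actual optimizer code from this diff): a complex tensor is viewed as a real tensor with a trailing dimension of size 2, the element-wise arithmetic runs on that real view, and the writes land in the original complex parameter. The SGD-style update below is only a stand-in for Adadelta's update rule.

```python
import torch

# View the complex parameter and gradient as real tensors of shape (..., 2),
# where the last dimension holds the real and imaginary parts.
param = torch.zeros(3, dtype=torch.complex64)
grad = (1 + 2j) * torch.ones(3, dtype=torch.complex64)

param_r = torch.view_as_real(param)   # shape (3, 2), dtype float32
grad_r = torch.view_as_real(grad)

lr = 0.1
param_r.add_(grad_r, alpha=-lr)       # stand-in for the Adadelta update
print(param)                          # the complex parameter was updated in place
```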

Differential Revision: [D31631870](https://our.internmc.facebook.com/intern/diff/D31631870/)

[ghstack-poisoned]
choward232 2021-10-25 13:04:27 -07:00
commit f2aa06c17a
115 changed files with 1785 additions and 372 deletions

View File

@ -10,7 +10,6 @@ CONFIG_TREE_DATA = [
]),
]),
# TODO: bring back libtorch test
("7", [X("3.6")]),
]),
("cuda", [
("10.2", [

.circleci/config.yml (generated, 44 changed lines)
View File

@ -6582,31 +6582,6 @@ workflows:
name: pytorch_cpp_doc_push
requires:
- pytorch_cpp_doc_build
- pytorch_linux_build:
name: pytorch_linux_xenial_py3_6_gcc7_build
requires:
- "docker-pytorch-linux-xenial-py3.6-gcc7"
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-py3.6-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
- pytorch_linux_test:
name: pytorch_linux_xenial_py3_6_gcc7_test
requires:
- pytorch_linux_xenial_py3_6_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-py3.6-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
resource_class: large
- pytorch_linux_build:
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
requires:
@ -8334,9 +8309,6 @@ workflows:
only: /.*/
tags:
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
- docker_build_job:
name: "docker-pytorch-linux-xenial-py3.6-gcc7"
image_name: "pytorch-linux-xenial-py3.6-gcc7"
when: << pipeline.parameters.run_build >>
master_build:
jobs:
@ -8352,19 +8324,6 @@ workflows:
- pytorch_cpp_doc_build:
requires:
- pytorch_linux_xenial_py3_6_gcc5_4_build
- pytorch_linux_build:
name: pytorch_linux_xenial_py3_6_gcc7_build
requires:
- "docker-pytorch-linux-xenial-py3.6-gcc7"
build_environment: "pytorch-linux-xenial-py3.6-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
- pytorch_linux_test:
name: pytorch_linux_xenial_py3_6_gcc7_test
requires:
- pytorch_linux_xenial_py3_6_gcc7_build
build_environment: "pytorch-linux-xenial-py3.6-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7"
resource_class: large
- pytorch_linux_build:
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
requires:
@ -8474,9 +8433,6 @@ workflows:
- docker_build_job:
name: "docker-pytorch-linux-xenial-py3.6-gcc5.4"
image_name: "pytorch-linux-xenial-py3.6-gcc5.4"
- docker_build_job:
name: "docker-pytorch-linux-xenial-py3.6-gcc7"
image_name: "pytorch-linux-xenial-py3.6-gcc7"
when: << pipeline.parameters.run_master_build >>
ecr_gc:
triggers:

View File

@ -17,6 +17,7 @@
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"parallelnative-linux-xenial-py3.6-gcc5.4",
"periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",
@ -36,6 +37,7 @@
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"parallelnative-linux-xenial-py3.6-gcc5.4",
"win-vs2019-cpu-py3"
@ -62,6 +64,7 @@
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"win-vs2019-cpu-py3",
"win-vs2019-cuda11.3-py3"
@ -87,6 +90,7 @@
"linux-xenial-py3.6-clang7-asan",
"linux-xenial-py3.6-clang7-onnx",
"linux-xenial-py3.6-gcc5.4",
"linux-xenial-py3.6-gcc7",
"linux-xenial-py3.6-gcc7-bazel-test",
"parallelnative-linux-xenial-py3.6-gcc5.4",
"periodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7",

View File

@ -274,6 +274,17 @@ LINUX_WORKFLOWS = [
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}
),
),
CIWorkflow(
arch="linux",
build_environment="linux-xenial-py3.6-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc7",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=2,
ciflow_config=CIFlowConfig(
run_on_canary=True,
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}
),
),
# ParallelTBB does not have a maintainer and is currently flaky
# CIWorkflow(
# arch="linux",

.github/workflows/generated-linux-xenial-py3.6-gcc7.yml (generated, vendored, new file, 517 lines)
View File

@ -0,0 +1,517 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/linux_ci_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: linux-xenial-py3.6-gcc7
on:
pull_request:
types: [opened, synchronize, reopened, unassigned]
push:
branches:
- master
- release/*
workflow_dispatch:
env:
BUILD_ENVIRONMENT: linux-xenial-py3.6-gcc7
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc7
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
TORCH_CUDA_ARCH_LIST: 5.2
IN_CI: 1
IS_GHA: 1
# This is used for the phase of adding wheel tests only, will be removed once completed
IN_WHEEL_TEST: 1
# Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh
CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_DEFAULT_REGION: us-east-1
CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
concurrency:
group: linux-xenial-py3.6-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
ciflow_should_run:
runs-on: ubuntu-18.04
env:
IS_PROBOT_TRIGGER_EVENT: ${{ (github.event.action == 'unassigned') && (github.event.assigneed.login == 'pytorchbot') }}
LABEL_CONDITIONS: ${{ contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux') }}
LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
if: ${{ (github.repository_owner == 'pytorch') && (
(github.event_name == 'push') ||
(github.event_name == 'schedule') ||
(contains(github.event.pull_request.labels.*.name, 'ciflow/all') || contains(github.event.pull_request.labels.*.name, 'ciflow/cpu') || contains(github.event.pull_request.labels.*.name, 'ciflow/default') || contains(github.event.pull_request.labels.*.name, 'ciflow/linux')) ||
((github.event_name == 'pull_request' && github.event.action != 'unassigned') && !contains(join(github.event.pull_request.labels.*.name), 'ciflow/')))
}}
steps:
- name: noop
run: echo running ciflow_should_run
- name: print labels
run: echo "${LABELS}"
build:
runs-on: linux.2xlarge
needs: [ciflow_should_run]
env:
JOB_BASE_NAME: linux-xenial-py3.6-gcc7-build
outputs:
docker_image: ${{ steps.calculate-tag.outputs.docker_image }}
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
- name: Log in to ECR
env:
AWS_RETRY_MODE: standard
AWS_MAX_ATTEMPTS: 5
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"
# Ensure the working directory gets chowned back to the current user
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
rm -f ~/.ssh/authorized_keys
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
# deep clone, to allow use of git merge-base
fetch-depth: 0
submodules: recursive
- name: Calculate docker image tag
id: calculate-tag
run: |
DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"
echo "::set-output name=docker_tag::${DOCKER_TAG}"
echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"
- name: Check if image should be built
id: check
env:
BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }}
run: |
set -x
# Check if image already exists, if it does then skip building it
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then
exit 0
fi
if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then
# if we're on the base branch then use the parent commit
MERGE_BASE=$(git rev-parse HEAD~)
else
# otherwise we're on a PR, so use the most recent base commit
MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
fi
# Covers the case where a previous tag doesn't exist for the tree
# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
exit 1
fi
PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
# If no image exists but the hash is the same as the previous hash then we should error out here
if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
echo " contact the PyTorch team to restore the original images"
exit 1
fi
echo ::set-output name=rebuild::yes
- name: Build and push docker image
if: ${{ steps.check.outputs.rebuild }}
env:
DOCKER_SKIP_S3_UPLOAD: 1
working-directory: .circleci/docker
run: |
export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/}
./build_docker.sh
- name: Pull Docker image
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${DOCKER_IMAGE}"
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Build
env:
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e JOB_BASE_NAME \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e IS_GHA \
-e CIRCLE_PR_NUMBER \
-e CIRCLE_SHA1 \
-e CIRCLE_BRANCH \
-e GITHUB_RUN_ID \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e PR_LABELS \
-e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
- name: Display and upload binary build size statistics (Click Me)
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
run: |
COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0)
export COMMIT_TIME
pip3 install requests==2.26 boto3==1.16.34
python3 -m tools.stats.upload_binary_size_to_scuba || exit 0
- name: Chown workspace
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Archive artifacts into zip
run: |
zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json
- uses: seemethere/upload-artifact-s3@v3
name: Store PyTorch Build Artifacts on S3
with:
name: ${{ env.BUILD_ENVIRONMENT }}
retention-days: 14
if-no-files-found: error
path:
artifacts.zip
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Chown workspace
if: always()
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
if: always()
run: |
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
# Prune all of the docker images
docker system prune -af
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Clean up docker images
if: always()
run: |
# Prune all of the docker images
docker system prune -af
generate-test-matrix:
runs-on: ubuntu-18.04
needs: [ciflow_should_run]
env:
TEST_RUNNER_TYPE: linux.2xlarge
ENABLE_DISTRIBUTED_TEST: 1
ENABLE_JIT_LEGACY_TEST: ''
ENABLE_MULTIGPU_TEST: ''
ENABLE_NOGPU_NO_AVX_TEST: ''
ENABLE_NOGPU_NO_AVX2_TEST: ''
ENABLE_SLOW_TEST: ''
ENABLE_DOCS_TEST: ''
ENABLE_BACKWARDS_COMPAT_TEST: ''
ENABLE_XLA_TEST: ''
ENABLE_NOARCH_TEST: ''
NUM_TEST_SHARDS: 2
MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu
NOGPU_RUNNER_TYPE: linux.2xlarge
PR_BODY: ${{ github.event.pull_request.body }}
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
render-matrix: ${{ steps.set-matrix.outputs.render-matrix }}
ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }}
container:
image: python:3.9
steps:
- name: Install dependencies
run: pip install typing-extensions==3.10
- name: Clone pytorch/pytorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
- name: Generating test matrix
id: set-matrix
run: .github/scripts/generate_pytorch_test_matrix.py
test:
needs: [build, generate-test-matrix, ciflow_should_run]
strategy:
matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
env:
DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }}
JOB_BASE_NAME: linux-xenial-py3.6-gcc7-test
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }}
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
- name: Log in to ECR
env:
AWS_RETRY_MODE: standard
AWS_MAX_ATTEMPTS: 5
run: |
aws ecr get-login --no-include-email --region us-east-1 > /tmp/ecr-login.sh
bash /tmp/ecr-login.sh
rm /tmp/ecr-login.sh
- name: Chown workspace
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"
# Ensure the working directory gets chowned back to the current user
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Clean workspace
run: |
rm -rf "${GITHUB_WORKSPACE:?}/*"
rm -f ~/.ssh/authorized_keys
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: seemethere/add-github-ssh-key@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Checkout PyTorch
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
with:
# deep clone, to allow use of git merge-base
fetch-depth: 0
submodules: recursive
- name: Pull Docker image
run: |
retry () {
"$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${DOCKER_IMAGE}"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }}
run: |
bash .github/scripts/install_nvidia_utils_linux.sh
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Determine shm-size
run: |
shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
*cuda*)
shm_size="2g"
;;
*rocm*)
shm_size="8g"
;;
esac
echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}"
- uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
name: Download PyTorch Build Artifacts
with:
name: ${{ env.BUILD_ENVIRONMENT }}
- name: Unzip artifacts
run: |
unzip -o artifacts.zip
- name: Output disk space left
run: |
sudo df -H
- name: Parse ref
id: parse-ref
run: .github/scripts/parse_ref.py
- name: Test
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
# Time out the test phase after 240 minutes
timeout-minutes: 240
run: |
set -x
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
TEST_COMMAND=.jenkins/caffe2/test.sh
else
TEST_COMMAND=.jenkins/pytorch/test.sh
fi
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086
container_name=$(docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e CUSTOM_TEST_ARTIFACT_BUILD_DIR \
-e GITHUB_ACTIONS \
-e IN_CI \
-e IS_GHA \
-e CIRCLE_BRANCH \
-e CIRCLE_SHA1 \
-e CIRCLE_PR_NUMBER \
-e AWS_DEFAULT_REGION \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e JOB_BASE_NAME \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e PYTORCH_IGNORE_DISABLED_ISSUES \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}"
- name: Chown workspace
if: always()
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Install render_test_results dependencies
if: always()
shell: bash
run: |
python3 -m pip install junitparser==2.1.1 rich==10.9.0
- name: "[[ Click me for rendered test results (useful for finding failing tests) ]]"
if: always()
shell: bash
# Encoding is weird on windows, just try to default to utf-8 if possible
env:
PYTHONIOENCODING: "utf-8"
run: |
python3 tools/render_junit.py test/
- name: Zip test reports for upload
if: always()
env:
FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}'
run: |
# Remove any previous test reports if they exist
rm -f test-reports-*.zip
zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
- uses: seemethere/upload-artifact-s3@v3
name: Store Test Reports on S3
if: always()
with:
retention-days: 14
if-no-files-found: error
path:
test-reports-*.zip
- name: Display and upload test statistics (Click Me)
if: always()
# temporary hack: set CIRCLE_* vars, until we update
# tools/stats/print_test_stats.py to natively support GitHub Actions
env:
AWS_DEFAULT_REGION: us-east-1
CIRCLE_BRANCH: ${{ steps.parse-ref.outputs.branch }}
JOB_BASE_NAME: linux-xenial-py3.6-gcc7-test
CIRCLE_PR_NUMBER: ${{ github.event.pull_request.number }}
CIRCLE_SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
CIRCLE_TAG: ${{ steps.parse-ref.outputs.tag }}
CIRCLE_WORKFLOW_ID: '${{ github.run_id }}_${{ github.run_number }}'
shell: bash
run: |
python3 -m pip install -r requirements.txt
python3 -m pip install boto3==1.16.34
python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
- name: Hold runner for 2 hours or until ssh sessions have drained
# Always hold for active ssh sessions
if: always()
run: .github/scripts/wait_for_ssh_to_drain.sh
- name: Chown workspace
if: always()
env:
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
run: |
# Ensure the working directory gets chowned back to the current user
docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
if: always()
run: |
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
# Prune all of the docker images
docker system prune -af

View File

@ -327,8 +327,10 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
endif()
endif()
if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND
CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.9 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8)

View File

@ -544,10 +544,11 @@ static void check_shape_forward(const at::Tensor& input,
bool kernel_size_correct = true;
TORCH_CHECK(input.size(1) == (weight_sizes[1] * groups),
"Given groups=", groups, ", weight of size ", weight_sizes,
", expected input", input.sizes(), " to have ",
(weight_sizes[1] * groups), " channels, but got ", input.size(1),
" channels instead");
"Given groups=", groups, ", weight of size ", weight_sizes,
", expected input", input.sizes(), " to have ",
(weight_sizes[1] * groups), " channels, but got ", input.size(1),
" channels instead");
TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]),
"Given weight of size ", weight_sizes,
", expected bias to be 1-dimensional with ", weight_sizes[0], " elements",
@ -850,7 +851,7 @@ at::Tensor _convolution(
check_shape_forward(input, weight_sizes, bias, params);
if (input.size(0) == 0) {
if (input.size(0) == 0 || input.size(1) == 0) {
// don't send empty inputs through backends
// but need to compute correct output size first and set up history for params
std::vector<int64_t> o;
@ -862,6 +863,9 @@ at::Tensor _convolution(
params.output_padding, params.stride, params.dilation,
params.groups);
}
if (input.size(1) == 0) {
o[input_channels_dim] = 0;
}
if (input_is_mkldnn && weight.is_mkldnn()) {
// mkldnn will error on the below 0-dim handling code
return empty_mkldnn(
@ -871,10 +875,12 @@ at::Tensor _convolution(
input.options().device_opt(),
input.options().pinned_memory_opt());
}
auto weight_view = at::_unsafe_view(weight, -1);
auto out = input*weight_view[0];
if (bias.defined())
auto out = (input.size(1) == 0) ? (input.view(-1) * weight_view) : (input * weight_view[0]);
if (bias.defined()) {
out.add_(bias[0]);
}
return out.view(o);
}

View File

@ -52,20 +52,17 @@ static inline void slow_conv2d_shape_check(
}
const int64_t ndim = input.dim();
const int64_t dim_batch = 0;
const int64_t dim_planes = 1;
const int64_t dim_height = 2;
const int64_t dim_width = 3;
// Allow for empty batch size but not other dimensions
bool valid_empty = ndim == 4 && input.size(dim_batch) == 0 &&
input.size(dim_planes) != 0 && input.size(dim_height) != 0 &&
input.size(dim_width) != 0;
TORCH_CHECK(
(input.numel() > 0 || valid_empty) && ndim == 4,
"non-empty 4D input tensor expected but got: ",
input.sizes());
// Allow for empty batch size and channel size but not other dimensions
TORCH_CHECK(ndim == 4, "Expected 4D input tensor, but got: ", input.sizes());
for (int64_t dim = 2; dim < ndim; ++dim) {
TORCH_CHECK(input.size(dim) != 0,
"Expected non-zero size for input dimension ", dim,
", but got input shape: ", input.sizes(), ". Only the batch and channel dimensions support size 0.");
}
const int64_t input_height = input.size(dim_height);
const int64_t input_width = input.size(dim_width);
@ -109,7 +106,9 @@ static inline void slow_conv2d_shape_check(
if (weight.dim() == 2) {
n_input_plane /= (kernel_height * kernel_width);
}
check_dim_size(input, ndim, dim_planes, n_input_plane);
if (input.size(1) != 0) {
check_dim_size(input, ndim, dim_planes, n_input_plane);
}
}
if (grad_output.defined()) {
@ -529,6 +528,7 @@ std::tuple<Tensor, Tensor> slow_conv2d_forward_cpu(
padding,
output,
finput);
return std::make_tuple(output, finput);
}
@ -559,6 +559,7 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cpu(
at::sum_out(grad_bias, grad_output, IntArrayRef{0, 2, 3});
}
if (grad_weight.defined()) {
grad_weight.resize_(weight.sizes());
grad_weight.zero_();

View File

@ -82,11 +82,11 @@ static void BM_deep_wide_static(benchmark::State& state) {
auto user_emb = torch::randn({batch_size, 1, embedding_size});
auto wide = torch::randn({batch_size, num_features});
std::vector<at::Tensor> inputs({ad_emb_packed, user_emb, wide});
std::vector<c10::IValue> inputs({ad_emb_packed, user_emb, wide});
smod(inputs);
smod(inputs, {});
for (auto _ : state) {
smod(inputs);
smod(inputs, {});
}
}
@ -104,11 +104,11 @@ static void BM_deep_wide_static_threaded(benchmark::State& state) {
auto user_emb = torch::randn({batch_size, 1, embedding_size});
auto wide = torch::randn({batch_size, num_features});
std::vector<at::Tensor> inputs({ad_emb_packed, user_emb, wide});
std::vector<c10::IValue> inputs({ad_emb_packed, user_emb, wide});
sr(inputs);
sr(inputs, {});
for (auto _ : state) {
sr(inputs);
sr(inputs, {});
}
}
@ -118,11 +118,11 @@ static void BM_leaky_relu_const(benchmark::State& state) {
const int batch_size = state.range(0);
auto data = torch::randn({batch_size, num_features});
std::vector<at::Tensor> inputs({data});
std::vector<c10::IValue> inputs({data});
smod(inputs);
smod(inputs, {});
for (auto _ : state) {
smod(inputs);
smod(inputs, {});
}
}
@ -133,11 +133,11 @@ static void BM_leaky_relu(benchmark::State& state) {
const int batch_size = state.range(0);
auto neg_slope = torch::randn(1);
auto data = torch::randn({batch_size, num_features});
std::vector<at::Tensor> inputs({data, neg_slope[0]});
std::vector<c10::IValue> inputs({data, neg_slope[0]});
smod(inputs);
smod(inputs, {});
for (auto _ : state) {
smod(inputs);
smod(inputs, {});
}
}
@ -150,11 +150,11 @@ static void BM_signed_log1p(benchmark::State& state) {
const int num_elements = state.range(0);
auto data = torch::randn({num_elements});
std::vector<at::Tensor> inputs({data});
std::vector<c10::IValue> inputs({data});
smod(inputs);
smod(inputs, {});
for (auto _ : state) {
smod(inputs);
smod(inputs, {});
}
}
@ -170,11 +170,11 @@ static void BM_long_static_memory_optimization(benchmark::State& state) {
auto a = torch::randn({N, N});
auto b = torch::randn({N, N});
auto c = torch::randn({N, N});
std::vector<at::Tensor> inputs({a, b, c});
std::vector<c10::IValue> inputs({a, b, c});
smod(inputs);
smod(inputs, {});
for (auto _ : state) {
smod(inputs);
smod(inputs, {});
}
}

View File

@ -755,9 +755,9 @@ TEST(StaticRuntime, LongModel) {
at::Tensor output_1 = mod.forward(input_ivalues).toTensor();
// run static runtime
std::vector<at::Tensor> input_tensors({a, b, c});
std::vector<c10::IValue> input_tensors({a, b, c});
torch::jit::StaticModule smod(mod);
at::Tensor output_2 = smod(input_tensors)[0];
at::Tensor output_2 = smod(input_tensors, {}).toTensor();
smod.runtime().check_for_memory_leak();
EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6));
}
@ -773,9 +773,9 @@ TEST(StaticRuntime, TrivialModel) {
at::Tensor output_1 = mod.forward(input_ivalues).toTensor();
// run static runtime
std::vector<at::Tensor> input_tensors({a, b, c});
std::vector<c10::IValue> input_tensors({a, b, c});
torch::jit::StaticModule smod(mod);
at::Tensor output_2 = smod(input_tensors)[0];
at::Tensor output_2 = smod(input_tensors, {}).toTensor();
smod.runtime().check_for_memory_leak();
EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6));
}
@ -789,9 +789,9 @@ TEST(StaticRuntime, LeakyReLU) {
at::Tensor output_1 = mod.forward(input_ivalues).toTensor();
// run static runtime
std::vector<at::Tensor> input_tensors({inputs});
std::vector<c10::IValue> input_tensors({inputs});
torch::jit::StaticModule smod(mod);
at::Tensor output_2 = smod(input_tensors)[0];
at::Tensor output_2 = smod(input_tensors, {}).toTensor();
smod.runtime().check_for_memory_leak();
EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6));
}
@ -813,8 +813,10 @@ TEST(StaticRuntime, DeepWide) {
auto output_1 = getTensor(mod.forward(inputs));
// run static runtime
std::vector<at::Tensor> input_tensors({ad_emb_packed, user_emb, wide});
at::Tensor output_2 = smod(input_tensors)[0];
std::vector<c10::IValue> input_tensors({ad_emb_packed, user_emb, wide});
auto outputs = smod(input_tensors, {}).toTuple()->elements();
ASSERT_TRUE(outputs.size() > 0);
at::Tensor output_2 = outputs[0].toTensor();
smod.runtime().check_for_memory_leak();
EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6));
}
@ -947,9 +949,11 @@ TEST(StaticRuntime, CleanUpMemory) {
auto output_1 = getTensor(mod.forward(inputs));
// run static runtime
std::vector<at::Tensor> input_tensors(
std::vector<c10::IValue> input_tensors(
{ad_emb_packed, user_emb, wide});
at::Tensor output_2 = runtime(input_tensors)[0];
auto outputs = runtime(input_tensors, {}).toTuple()->elements();
ASSERT_TRUE(outputs.size() > 0);
auto output_2 = outputs[0].toTensor();
runtime.check_for_memory_leak();
EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6));
if (manage_output_tensors) {
@ -1053,9 +1057,9 @@ TEST(StaticRuntime, ManageOutputTensorsWithDeallocateOutputTensors) {
torch::randn({batch_size, 1, embedding_size});
auto user_emb = torch::randn({batch_size, 1, embedding_size});
auto wide = torch::randn({batch_size, num_features});
std::vector<at::Tensor> input_tensors(
std::vector<c10::IValue> input_tensors(
{ad_emb_packed, user_emb, wide});
runtime(input_tensors)[0];
runtime(input_tensors, {});
runtime.check_for_memory_leak();
runtime.deallocateOutputTensors();
runtime.checkOutputTensorMemoryLeaks();
@ -1079,21 +1083,21 @@ TEST(StaticRuntime, ManageOutputTensorsWithoutDeallocateOutputTensors) {
torch::randn({batch_size, 1, embedding_size});
auto user_emb = torch::randn({batch_size, 1, embedding_size});
auto wide = torch::randn({batch_size, num_features});
std::vector<at::Tensor> input_tensors(
std::vector<c10::IValue> input_tensors(
{ad_emb_packed, user_emb, wide});
// Profile run.
runtime(input_tensors)[0];
runtime(input_tensors, {});
runtime.deallocateOutputTensors();
// Run again to allocate output Tensors without deallocating them.
runtime(input_tensors)[0];
runtime(input_tensors, {});
// Memory leak checking fails.
EXPECT_THROW(runtime.checkOutputTensorMemoryLeaks(), std::exception);
// Calling the runtime without deallocation fails too.
EXPECT_THROW(runtime(input_tensors)[0], std::exception);
EXPECT_THROW(runtime(input_tensors, {}), std::exception);
// After deallocation, everything works fine.
runtime.deallocateOutputTensors();
runtime.checkOutputTensorMemoryLeaks();
runtime(input_tensors)[0];
runtime(input_tensors, {});
}
TEST(StaticRuntime, FusionPass) {

View File

@ -46,6 +46,7 @@
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/jit/runtime/graph_executor.h>
#include <torch/csrc/jit/runtime/interpreter.h>
#include <torch/csrc/jit/runtime/jit_trace.h>
#include <torch/csrc/jit/runtime/profiling_record.h>
#include <torch/csrc/jit/runtime/symbolic_script.h>
#include <torch/csrc/jit/serialization/import.h>
@ -58,6 +59,8 @@
#include <c10/util/Exception.h>
#include <c10/util/ThreadLocalDebugInfo.h>
#include <torch/csrc/jit/passes/freeze_module.h>
#include <torch/csrc/jit/passes/frozen_graph_optimizations.h>
#include <algorithm>
#include <cstddef>
#include <functional>
@ -67,6 +70,7 @@
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
@ -1803,6 +1807,46 @@ TEST(LoopPeelerTest, SimpleNestedLoops2) {
}
}
TEST(JitTracing, Basic) {
constexpr int batch_size = 4;
constexpr int input_size = 256;
int hidden_size = 2 * input_size;
auto input = at::randn({batch_size, input_size}, at::kCPU);
auto hx = at::randn({batch_size, hidden_size}, at::kCPU);
auto cx = at::randn({batch_size, hidden_size}, at::kCPU);
auto w_ih = t_def(at::randn({4 * hidden_size, input_size}, at::kCPU));
auto w_hh = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCPU));
auto graph = build_lstm();
auto stack = createStack({input, hx, cx, w_ih, w_hh});
auto traced = TraceGraph(graph, stack);
Tensor prof_out;
pop(stack, prof_out);
{
stack = createStack({input, hx, cx, w_ih, w_hh});
Code cd(traced, "traced");
InterpreterState is{cd};
is.run(stack);
Tensor traced_out;
pop(stack, traced_out);
torch::allclose(prof_out, traced_out);
}
{
stack = createStack({input, hx, cx, w_ih, w_hh});
Code cd(graph, "graph");
InterpreterState is{cd};
is.run(stack);
Tensor scripted_out;
pop(stack, scripted_out);
torch::allclose(prof_out, scripted_out);
}
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
TEST(InsertAndEliminateRedundantGuardsTest, Basic) {
static const auto basic_example = R"JIT(
def basic(x, y):

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import io
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch.testing._internal.common_utils import TestCase

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
from torch.testing import FileCheck
from torch.testing._internal.jit_utils import JitTestCase
import torch

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
from torch.testing._internal.jit_utils import JitTestCase
import io
import os

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch.testing import FileCheck
from torch.testing._internal.jit_utils import JitTestCase

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import inspect

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import io
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import gc

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import io
import os
import shutil

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import io
import unittest
from itertools import product
@ -1289,6 +1291,101 @@ class TestFreezing(JitTestCase):
FileCheck().check('prim::GetAttr[name="a"]').run(fm.forward.graph)
FileCheck().check('prim::GetAttr[name="b"]').run(fm.modify_a.graph)
def test_freeze_module_with_user_preserved_attribute_on_submodule(self):
class SubModule(nn.Module):
def __init__(self):
super(SubModule, self).__init__()
self.a = 1
self.b = 2
def forward(self):
return self.a + self.b
class Module(nn.Module):
def __init__(self):
super(Module, self).__init__()
self.sub1 = SubModule()
self.sub2 = SubModule()
def forward(self):
return self.sub1() + self.sub2()
m = torch.jit.script(Module())
m.eval()
m = torch.jit.freeze(m, preserved_attrs=['sub1.a', 'sub2.a'])
fm = m._c
self.assertTrue(fm.hasattr('sub1'))
self.assertTrue(fm.sub1.hasattr('a'))
self.assertFalse(fm.sub1.hasattr('b'))
self.assertTrue(fm.hasattr('sub2'))
self.assertTrue(fm.sub2.hasattr('a'))
self.assertFalse(fm.sub2.hasattr('b'))
self.assertEqual(m(), 6)
m.sub1.a += 1
self.assertEqual(m(), 7)
def test_freeze_module_with_user_preserved_attribute_on_unused_submodule(self):
class SubModule(nn.Module):
def __init__(self):
super(SubModule, self).__init__()
self.a = 1
self.b = 2
def forward(self):
return self.a + self.b
@torch.jit.export
def method_a(self):
return 42
class Module(nn.Module):
def __init__(self):
super(Module, self).__init__()
self.sub = SubModule()
def forward(self):
return 1
m = torch.jit.script(Module())
m.eval()
fm = torch.jit.freeze(m, preserved_attrs=['sub.a', 'sub.method_a'])._c
self.assertTrue(fm.hasattr('sub'))
self.assertTrue(fm.sub.hasattr('a'))
self.assertFalse(fm.sub.hasattr('b'))
self.assertTrue(fm.sub._has_method('method_a'))
def test_freeze_module_with_user_preserved_method_on_submodule(self):
class SubModule(nn.Module):
def __init__(self):
super(SubModule, self).__init__()
def forward(self, x):
return self.method_a(x) + self.method_b(x)
def method_a(self, x):
return x * x
def method_b(self, x):
return x + x
class Module(nn.Module):
def __init__(self):
super(Module, self).__init__()
self.sub = SubModule()
def forward(self, x):
return self.sub(x)
m = torch.jit.script(Module())
m.eval()
fm = torch.jit.freeze(m, preserved_attrs=['sub.method_a'])._c
self.assertTrue(fm.hasattr('sub'))
self.assertTrue(fm.sub._has_method('method_a'))
self.assertFalse(fm.sub._has_method('method_b'))
@skipIfNoFBGEMM
def test_module_with_shared_type_instances(self):
class Child(nn.Module):

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch.testing._internal.jit_utils import JitTestCase

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
from torch.testing._internal.jit_utils import JitTestCase
import torch
import torch._C

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from typing import List, Tuple

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import torch

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
from textwrap import dedent

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import inspect

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
from typing import Any, Dict, List, Optional, Tuple
from torch.testing._internal.jit_utils import JitTestCase, make_global

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import unittest

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
# flake8: noqa
# TODO: enable linting check for this file

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["module: onnx"]
import io
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: mobile"]
import torch
import torch._C
import torch.backends.xnnpack

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch import nn

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import torch

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch.testing._internal.jit_utils import JitTestCase, RUN_CUDA, _inline_everything
from torch import nn

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch.testing._internal.jit_utils import JitTestCase

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import tempfile

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch.testing._internal.jit_utils import JitTestCase

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import types

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
from itertools import product as product
from typing import NamedTuple, Optional
import io

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import warnings

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import torch
from torch.testing._internal.jit_utils import JitTestCase, execWrapper
import operator

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import io
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import unittest
import io
import os

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import io

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
from collections import namedtuple
from typing import Dict, List, Optional, Tuple

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import io
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys
import io

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import os
import sys

View File

@ -1481,6 +1481,7 @@ class TestNormalizeOperators(JitTestCase):
"index_put",
"nn.functional.conv2d",
"nn.functional.dropout",
"nn.functional.embedding", # Implemented with a lambda
"polygamma",
"special.polygamma",
"repeat",

View File

@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# Owner(s): ["oncall: jit"]
import torch
# This is how we include tests located in test/jit/...

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import unittest
import os
import random

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import sys
import os
import contextlib

View File

@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# Owner(s): ["oncall: jit"]
import unittest
import os
import sys

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import sys
sys.argv.append("--jit_executor=legacy")
from test_jit_fuser import * # noqa: F403

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import sys
sys.argv.append("--jit_executor=legacy")
from test_jit import * # noqa: F403

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import sys
sys.argv.append("--jit_executor=profiling")
from test_jit import * # noqa: F403

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
import sys
sys.argv.append("--jit_executor=simple")
from test_jit import * # noqa: F403

View File

@ -1,3 +1,5 @@
# Owner(s): ["oncall: jit"]
from test_jit import JitTestCase
from torch.testing._internal.common_utils import run_tests

View File

@ -13982,6 +13982,32 @@ class TestNNDeviceType(NNTestCase):
self.assertEqual(mod.weight.grad, torch.tensor([0., 0, 0], device=device))
self.assertEqual(mod.bias.grad, torch.tensor([0., 0, 0], device=device))
def test_conv_empty_channel(self, device):
in_channels = 0
mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2).to(device)
inp = torch.randn(2, 0, 15, device=device)
self._test_module_empty_input(mod, inp, check_size=False)
with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
inp = torch.randn(2, 1, 0, device=device)
mod(inp)
mod = torch.nn.Conv2d(in_channels, 33, 3, stride=2).to(device)
inp = torch.randn(2, 0, 50, 100, device=device)
self._test_module_empty_input(mod, inp, check_size=False)
with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
inp = torch.randn(2, 1, 40, 0, device=device)
mod(inp)
mod = torch.nn.Conv3d(in_channels, 33, 3, stride=2).to(device)
inp = torch.randn(2, 0, 50, 20, 40, device=device)
self._test_module_empty_input(mod, inp, check_size=False)
with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
inp = torch.randn(2, 1, 50, 0, 40, device=device)
mod(inp)
def test_group_conv_empty(self, device):
mod = torch.nn.Conv2d(4, 4, stride=2, kernel_size=3, padding=1, groups=4).to(device)
inp = torch.randn(0, 4, 4, 4, device=device)

View File

@ -247,8 +247,8 @@ class TestOptim(TestCase):
def _test_complex_optimizer(self, optimizer_constructor):
complex_param = torch.randn(5, 5, dtype=torch.complex64, requires_grad=True)
complex_opt = optimizer_constructor(complex_param)
real_param = torch.view_as_real(complex_param).detach().clone().requires_grad_()
complex_opt = optimizer_constructor(complex_param)
real_opt = optimizer_constructor(real_param)
for i in range(3):
@ -652,11 +652,6 @@ class TestOptim(TestCase):
[param], lr=1e-1, initial_accumulator_value=0.1
)
)
self._test_complex_optimizer(
lambda param: optimizer(
[param], lr=1e-1, initial_accumulator_value=0.1, weight_decay=1
)
)
def test_adamax(self):
for optimizer in [optim.Adamax, optim_mt.Adamax]:
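For context on how the complex-number path might be exercised end to end (the `_test_complex_optimizer` helper above is truncated at the loop), here is a hedged sketch; the loop body, hyperparameters, and the Adadelta invocation are illustrative assumptions, not the exact test added in this commit:

```python
import torch
from torch import optim

def check_complex_optimizer(optimizer_constructor):
    # Same idea as _test_complex_optimizer above: run the optimizer on a
    # complex parameter and on its real view, then check that the two
    # trajectories stay identical (the element-wise update should not care
    # whether the data is stored as complex values or as pairs of reals).
    complex_param = torch.randn(5, 5, dtype=torch.complex64, requires_grad=True)
    real_param = torch.view_as_real(complex_param).detach().clone().requires_grad_()
    complex_opt = optimizer_constructor(complex_param)
    real_opt = optimizer_constructor(real_param)
    for _ in range(3):
        complex_param.grad = torch.randn_like(complex_param)
        real_param.grad = torch.view_as_real(complex_param.grad).clone()
        complex_opt.step()
        real_opt.step()
        torch.testing.assert_close(torch.view_as_real(complex_param), real_param)

# Hypothetical invocation; the vectorized variant fixed by this commit lives
# under torch.optim._multi_tensor (imported as optim_mt in the test above).
check_complex_optimizer(lambda param: optim.Adadelta([param], lr=0.1))
```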

View File

@ -17,7 +17,7 @@ class StaticModule:
def __call__(self, *args, **kwargs):
if not kwargs:
return self.static_module(args)
return self.static_module(args, {})
else:
return self.static_module(args, kwargs)
@ -227,20 +227,20 @@ class TestStaticModule(TestCase):
bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512])
top_inp = torch.randn(2048, 100) # torch.Size([2048, 100])
ref_bot = bot_l(bot_inp)
acc_bot = bot_l_acc(bot_inp)[0]
acc_bot = bot_l_acc(bot_inp)
torch.testing.assert_close(acc_bot, ref_bot)
ref_top = top_l(top_inp)
acc_top = top_l_acc(top_inp)[0]
acc_top = top_l_acc(top_inp)
torch.testing.assert_close(acc_top, ref_top)
for _ in range(5):
with torch.no_grad():
bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512])
top_inp = torch.randn(2048, 100) # torch.Size([2048, 100])
ref_bot = bot_l(bot_inp)
acc_bot = bot_l_acc(bot_inp)[0]
acc_bot = bot_l_acc(bot_inp)
torch.testing.assert_close(acc_bot, ref_bot)
ref_top = top_l(top_inp)
acc_top = top_l_acc(top_inp)[0]
acc_top = top_l_acc(top_inp)
torch.testing.assert_close(acc_top, ref_top)
def test_trivial_graph(self):
@ -248,7 +248,7 @@ class TestStaticModule(TestCase):
tg = torch.jit.script(trivial_graph)
o_ref = tg(s, s, s)
tg_a = StaticModule(tg)
o_test = tg_a(s, s, s)[0]
o_test = tg_a(s, s, s)
torch.testing.assert_close(o_ref, o_test)
def test_leaky_relu(self):
@ -256,7 +256,7 @@ class TestStaticModule(TestCase):
tg = torch.jit.script(nn.LeakyReLU(0.1))
o_ref = tg(s)
tg_a = StaticModule(tg)
o_test = tg_a(s)[0]
o_test = tg_a(s)
torch.testing.assert_close(o_ref, o_test)
def test_attr(self):
@ -292,7 +292,7 @@ class TestStaticModule(TestCase):
ms = torch.jit.script(m)
sm = StaticModule(ms)
output_sm = sm(input)[0]
output_sm = sm(input)
torch.testing.assert_close(output_s, output_sm)
sm.benchmark([input], {}, 2, 2)
sm.benchmark_individual_ops([input], {}, 2, 2)

View File

@ -285,6 +285,7 @@ core_sources_full_mobile_no_backend_interface = [
"torch/csrc/jit/runtime/script_profile.cpp",
"torch/csrc/jit/runtime/symbolic_script.cpp",
"torch/csrc/jit/runtime/symbolic_shape_registry.cpp",
"torch/csrc/jit/runtime/jit_trace.cpp",
"torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp",
"torch/csrc/jit/serialization/import.cpp",
"torch/csrc/jit/serialization/import_export_helpers.cpp",

View File

@ -639,27 +639,28 @@ def convert_fx(
* `_remove_qconfig`: Option to remove the qconfig attributes in the model after convert.
* `qconfig_dict`: qconfig_dict with the either
a) same keys as what is passed to the qconfig_dict in prepare_fx API, with same values or `None`.
b) additional keys with values set to `None`
For each entry whose value is set to None, we skip quantizing that entry in the model.
Example:
* `qconfig_dict`: qconfig_dict with either same keys as what is passed to
the qconfig_dict in `prepare_fx` API, with same values or `None`, or
additional keys with values set to `None`
For each entry whose value is set to None, we skip quantizing that entry in the model::
qconfig_dict = {
# used for object_type, skip quantizing torch.nn.functional.add
"object_type": [
(torch.nn.functional.add, None),
(torch.nn.functional.linear, qconfig_from_prepare)
...,
],
# used for object_type, skip quantizing torch.nn.functional.add
"object_type": [
(torch.nn.functional.add, None),
(torch.nn.functional.linear, qconfig_from_prepare)
...,
],
# sed for module names, skip quantizing "foo.bar"
"module_name": [
("foo.bar", None)
...,
],
# sed for module names, skip quantizing "foo.bar"
"module_name": [
("foo.bar", None)
...,
],
}
Return:
A quantized model (GraphModule)

View File

@ -11,11 +11,10 @@ namespace deploy {
static const size_t NUM_FROZEN_PY_BUILTIN_MODULES = 6;
static const size_t NUM_FROZEN_PY_STDLIB_MODULES = 680;
extern "C" struct _frozen _PyImport_FrozenModules[];
extern "C" struct _frozen _PyImport_FrozenModules_torch[];
extern "C" PyObject* initModule(void);
REGISTER_TORCH_DEPLOY_BUILTIN(cpython_internal, PyImport_FrozenModules);
REGISTER_TORCH_DEPLOY_BUILTIN(frozenpython, _PyImport_FrozenModules);
REGISTER_TORCH_DEPLOY_BUILTIN(
frozentorch,
_PyImport_FrozenModules_torch,

View File

@ -34,78 +34,6 @@ using namespace py::literals;
#define PYOBJ_ASSERT(obj) assert(NULL != obj);
#endif
#define FOREACH_LIBRARY(_) \
_(array) \
_(_asyncio) \
_(audioop) \
_(binascii) \
_(_bisect) \
_(_blake2) \
_(_bz2) \
_(cmath) \
_(_codecs_cn) \
_(_codecs_hk) \
_(_codecs_iso2022) \
_(_codecs_jp) \
_(_codecs_kr) \
_(_codecs_tw) \
_(_contextvars) \
_(_crypt) \
_(_csv) \
_(_ctypes) \
_(_ctypes_test) \
_(_curses) \
_(_curses_panel) \
_(_datetime) \
_(_decimal) \
_(_elementtree) \
_(fcntl) \
_(grp) \
_(_hashlib) \
_(_heapq) \
_(_json) \
_(_lsprof) \
_(_lzma) \
_(math) \
_(_md5) \
_(mmap) \
_(_multibytecodec) \
_(_multiprocessing) \
_(nis) \
_(_opcode) \
_(ossaudiodev) \
_(parser) \
_(_pickle) \
_(_posixsubprocess) \
_(pyexpat) \
_(_queue) \
_(_random) \
_(readline) \
_(resource) \
_(select) \
_(_sha1) \
_(_sha256) \
_(_sha3) \
_(_sha512) \
_(_socket) \
_(spwd) \
_(_ssl) \
_(_struct) \
_(syslog) \
_(termios) \
_(_testbuffer) \
_(_testcapi) \
_(_testimportmultiple) \
_(_testmultiphase) \
_(unicodedata) \
_(xxlimited) \
_(_xxtestfuzz) \
_(zlib)
#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
#undef DECLARE_LIBRARY_INIT
const char* start = R"PYTHON(
import _ssl # must come before _hashlib otherwise ssl's locks will be set to a Python that might no longer exist...
import sys
@ -221,10 +149,6 @@ struct InitLockAcquire {
struct __attribute__((visibility("hidden"))) ConcreteInterpreterImpl
: public torch::deploy::InterpreterImpl {
ConcreteInterpreterImpl() {
#define APPEND_INIT(name) PyImport_AppendInittab(#name, PyInit_##name);
FOREACH_LIBRARY(APPEND_INIT)
#undef APPEND_INIT
BuiltinRegistry::runPreInitialization();
PyPreConfig preconfig;

View File

@ -0,0 +1,82 @@
#include <Python.h>
#include <torch/csrc/deploy/interpreter/builtin_registry.h>
#define FOREACH_LIBRARY(_) \
_(array) \
_(_asyncio) \
_(audioop) \
_(binascii) \
_(_bisect) \
_(_blake2) \
_(_bz2) \
_(cmath) \
_(_codecs_cn) \
_(_codecs_hk) \
_(_codecs_iso2022) \
_(_codecs_jp) \
_(_codecs_kr) \
_(_codecs_tw) \
_(_contextvars) \
_(_crypt) \
_(_csv) \
_(_ctypes) \
_(_ctypes_test) \
_(_curses) \
_(_curses_panel) \
_(_datetime) \
_(_decimal) \
_(_elementtree) \
_(fcntl) \
_(grp) \
_(_hashlib) \
_(_heapq) \
_(_json) \
_(_lsprof) \
_(_lzma) \
_(math) \
_(_md5) \
_(mmap) \
_(_multibytecodec) \
_(_multiprocessing) \
_(nis) \
_(_opcode) \
_(ossaudiodev) \
_(parser) \
_(_pickle) \
_(_posixsubprocess) \
_(pyexpat) \
_(_queue) \
_(_random) \
_(readline) \
_(resource) \
_(select) \
_(_sha1) \
_(_sha256) \
_(_sha3) \
_(_sha512) \
_(_socket) \
_(spwd) \
_(_ssl) \
_(_struct) \
_(syslog) \
_(termios) \
_(_testbuffer) \
_(_testcapi) \
_(_testimportmultiple) \
_(_testmultiphase) \
_(unicodedata) \
_(xxlimited) \
_(_xxtestfuzz) \
_(zlib)
#define DECLARE_LIBRARY_INIT(name) extern "C" PyObject* PyInit_##name(void);
FOREACH_LIBRARY(DECLARE_LIBRARY_INIT)
#undef DECLARE_LIBRARY_INIT
extern "C" struct _frozen _PyImport_FrozenModules[];
#define STD_LIBARY_PARMS(name) , #name, PyInit_##name
REGISTER_TORCH_DEPLOY_BUILTIN(
frozenpython,
_PyImport_FrozenModules FOREACH_LIBRARY(STD_LIBARY_PARMS));
#undef STD_LIBARY_PARMS

View File

@ -0,0 +1,29 @@
#pragma once
#include <vector>
#include <ATen/core/ivalue.h>
#include <ATen/core/operator_name.h>
#include <torch/csrc/jit/runtime/instruction.h>
namespace torch {
namespace jit {
namespace mobile {
using Stack = std::vector<c10::IValue>;
using DebugHandle = int64_t;
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct Code {
std::vector<Instruction> instructions_;
std::vector<DebugHandle> debug_handles_;
std::vector<c10::OperatorName> op_names_;
std::vector<std::function<void(Stack&)>> operators_;
std::vector<c10::IValue> constants_;
std::vector<c10::TypePtr> types_;
size_t register_size_; // Aggregated output size.
};
} // namespace mobile
} // namespace jit
} // namespace torch

View File

@ -0,0 +1,53 @@
#pragma once
#include <cstddef>
#include <c10/util/Optional.h>
#include <torch/csrc/jit/mobile/code.h>
namespace torch {
namespace jit {
namespace mobile {
class Frame {
public:
explicit Frame(const Code& code) : code_(code) {}
const Code& getCode() const {
return code_;
}
void step() {
pc_++;
}
void jump(size_t n) {
pc_ += n;
}
size_t getPC() const {
return pc_;
}
const Instruction& getInstruction() const {
return code_.instructions_.at(pc_);
}
c10::optional<int64_t> getDebugHandle() const {
return getDebugHandle(pc_);
}
c10::optional<int64_t> getDebugHandle(size_t pc) const {
if (pc >= code_.debug_handles_.size()) {
return {};
}
return code_.debug_handles_[pc];
}
private:
const Code& code_;
size_t pc_{0};
};
} // namespace mobile
} // namespace jit
} // namespace torch

View File

@ -167,7 +167,7 @@ bool Function::run(Stack& stack) const {
schema->checkAndNormalizeInputs(
stack, std::unordered_map<std::string, IValue>{} /*kwargs*/);
}
InterpreterState interp_state(code_);
InterpreterState interp_state(*code_);
return interp_state.run(stack);
}
@ -181,8 +181,7 @@ const std::shared_ptr<Code> Function::get_code() const {
}
int64_t Function::getExceptionDebugHandle() const {
size_t pc = getInterpretersExceptionPC();
return (pc < code_->debug_handles_.size()) ? code_->debug_handles_[pc] : -1;
return getInterpretersExceptionDebugHandle();
}
} // namespace mobile

View File

@ -8,23 +8,22 @@
#include <torch/csrc/jit/runtime/vararg_functions.h>
#include <ATen/record_function.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/mobile/observer.h>
#include <torch/csrc/jit/backends/backend_exception.h>
#include <torch/csrc/jit/mobile/observer.h>
namespace torch {
namespace jit {
char const* toString(OpCode op);
std::ostream& operator<<(std::ostream& out, Instruction inst);
namespace mobile {
InterpreterState::InterpreterState(std::shared_ptr<Code> code)
: code_(std::move(code)) {
registers_.resize(code_->register_size_);
InterpreterState::InterpreterState(const Code& code) {
enterFrame(code);
}
namespace {
static thread_local int64_t exception_pc_{-1};
static thread_local DebugHandle exception_debug_handle_{-1};
void createObject(Stack& stack, const at::ClassTypePtr& type) {
auto userObj = c10::ivalue::Object::create(
c10::StrongTypePtr(type->compilation_unit(), type),
@ -46,21 +45,42 @@ void isinstance(Stack& stack, at::ArrayRef<at::TypePtr> types) {
using namespace at;
int64_t getInterpretersExceptionPC() {
return exception_pc_;
int64_t getInterpretersExceptionDebugHandle() {
return exception_debug_handle_;
}
void InterpreterState::enterFrame(const Code& code) {
frames_.emplace_back(code);
registers_.resize(registers_.size() + code.register_size_);
}
void InterpreterState::leaveFrame() {
registers_.resize(
registers_.size() - frames_.back().getCode().register_size_);
frames_.pop_back();
}
void InterpreterState::saveExceptionDebugHandle() {
const auto& frame = frames_.back();
if (auto handle = frame.getDebugHandle()) {
exception_debug_handle_ = *handle;
}
}
bool InterpreterState::run(Stack& stack) {
size_t pc = 0;
while (true) {
try {
Instruction inst = code_->instructions_.at(pc);
auto& frame = frames_.back();
const auto& code = frame.getCode();
const auto pc = frame.getPC();
auto inst = frame.getInstruction();
// If no valid debug handle is found, just log the pc.
// This can happen when debug handles were not saved.
DebugHandle debug_handle = pc >= code_->debug_handles_.size()
? pc
: code_->debug_handles_.at(pc);
DebugHandle debug_handle = pc;
if (auto handle = frame.getDebugHandle()) {
debug_handle = *handle;
}
// std::cout << "RUNNING " << pc << " "
// << code_->instructions_with_handles_[pc].instruction;
@ -93,63 +113,63 @@ bool InterpreterState::run(Stack& stack) {
}
RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
code_->op_names_[inst.X].name, debug_handle, stack);
code_->operators_[inst.X](stack);
++pc;
code.op_names_[inst.X].name, debug_handle, stack);
code.operators_[inst.X](stack);
frame.step();
} break;
case OPN: {
stack.push_back(inst.N);
RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
code_->op_names_[inst.X].name, debug_handle, stack);
code_->operators_[inst.X](stack);
++pc;
code.op_names_[inst.X].name, debug_handle, stack);
code.operators_[inst.X](stack);
frame.step();
} break;
case INTERFACE_CALL: {
torch::jit::Function& method =
peek(stack, 0, inst.N)
.toObject()
->type()
->getMethod(code_->constants_[inst.X].toStringRef());
->getMethod(code.constants_[inst.X].toStringRef());
RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS(
method.name(), debug_handle, stack);
method.run(stack);
++pc;
frame.step();
} break;
case LOAD:
stack.emplace_back(reg(inst.X));
++pc;
frame.step();
break;
case MOVE:
stack.emplace_back(std::move(reg(inst.X)));
++pc;
frame.step();
break;
case STORE:
reg(inst.X) = pop(stack);
++pc;
frame.step();
break;
case STOREN:
for (size_t i = inst.N; i > 0; --i) {
reg(inst.X + i - 1) = pop(stack);
}
++pc;
frame.step();
break;
case DROP:
pop(stack);
++pc;
frame.step();
break;
case DROPR:
reg(inst.X) = IValue();
++pc;
frame.step();
break;
case LOADC:
stack.emplace_back(code_->constants_[inst.X]);
++pc;
stack.emplace_back(code.constants_[inst.X]);
frame.step();
break;
case GET_ATTR: {
auto userObj = pop(stack).toObject();
auto value = userObj->getSlot(inst.X);
push(stack, std::move(value));
++pc;
frame.step();
} break;
case SET_ATTR: {
auto v = pop(stack);
@ -163,72 +183,76 @@ bool InterpreterState::run(Stack& stack) {
userObj->type()->addAttribute(ss.str(), c10::NoneType::get());
}
userObj->setSlot(inst.X, std::move(v));
++pc;
frame.step();
} break;
case JF:
pc += (pop(stack).toBool()) ? 1 : inst.X;
frame.jump(pop(stack).toBool() ? 1 : inst.X);
break;
case JMP:
pc += inst.X;
frame.jump(inst.X);
break;
case LOOP: {
// stack: iteration_count, max_iter, cond, loop_carried_deps...
auto frame = stack.end() - (inst.N + 1);
int64_t trip_count = frame[0].toInt();
int64_t max_trip_count = frame[1].toInt();
bool cond = frame[2].toBool();
auto sframe = stack.end() - (inst.N + 1);
int64_t trip_count = sframe[0].toInt();
int64_t max_trip_count = sframe[1].toInt();
bool cond = sframe[2].toBool();
if (trip_count < max_trip_count && cond) {
frame[2] = trip_count;
frame[0] = trip_count + 1;
++pc;
sframe[2] = trip_count;
sframe[0] = trip_count + 1;
frame.step();
} else {
size_t n_loop_carried = inst.N - 2;
for (const auto i : c10::irange(n_loop_carried)) {
frame[i] = std::move(frame[i + 3]);
sframe[i] = std::move(sframe[i + 3]);
}
drop(stack, 3); // iteration_count, max_iter, cond
pc += inst.X;
frame.jump(inst.X);
}
} break;
case RET:
leaveFrame();
if (frames_.size() > 0) {
continue;
}
return false;
case LIST_CONSTRUCT: {
const auto& type = code_->types_[inst.X]->expectRef<at::ListType>();
const auto& type = code.types_[inst.X]->expectRef<at::ListType>();
listConstruct(stack, type, inst.N);
++pc;
frame.step();
} break;
case LIST_UNPACK: {
listUnpack(stack, inst.X);
++pc;
frame.step();
} break;
case TUPLE_CONSTRUCT: {
tupleConstruct(stack, inst.X);
++pc;
frame.step();
} break;
case TUPLE_SLICE: {
tupleSlice(stack, inst.X, inst.X + inst.N);
++pc;
frame.step();
} break;
case DICT_CONSTRUCT: {
const auto& type = code_->types_[inst.X]->expectRef<at::DictType>();
const auto& type = code.types_[inst.X]->expectRef<at::DictType>();
dictConstruct(stack, type, inst.N);
++pc;
frame.step();
} break;
case NAMED_TUPLE_CONSTRUCT: {
namedTupleConstruct(
stack, code_->types_[inst.X]->expect<at::TupleType>(), inst.N);
++pc;
stack, code.types_[inst.X]->expect<at::TupleType>(), inst.N);
frame.step();
} break;
case CREATE_OBJECT: {
auto type = code_->types_[inst.X]->expect<c10::ClassType>();
auto type = code.types_[inst.X]->expect<c10::ClassType>();
createObject(stack, type);
++pc;
frame.step();
} break;
case ISINSTANCE: {
at::ArrayRef<TypePtr> types(
&(code_->types_[inst.X]), &(code_->types_[inst.X + inst.N]));
&(code.types_[inst.X]), &(code.types_[inst.X + inst.N]));
isinstance(stack, types);
++pc;
frame.step();
} break;
case WARN: {
drop(stack, 1);
@ -240,7 +264,7 @@ bool InterpreterState::run(Stack& stack) {
const auto& sref = stack.back().toStringRef();
TORCH_WARN(sref);
stack.pop_back();
++pc;
frame.step();
} break;
default:
AT_ERROR(toString(inst.op), " is invalid.");
@ -251,15 +275,15 @@ bool InterpreterState::run(Stack& stack) {
}
// This exception must be caught first as it derived from c10::Error
} catch (c10::BackendRuntimeException& e) {
exception_pc_ = pc;
saveExceptionDebugHandle();
TORCH_RETHROW(e);
} catch (c10::Error& error) {
// Reason for catching and rethrowing the error is so that we can
// set the exception pc that is queried later
exception_pc_ = pc;
saveExceptionDebugHandle();
TORCH_RETHROW(error);
} catch (...) {
exception_pc_ = pc;
saveExceptionDebugHandle();
throw;
}
// for (auto val : stack) {

View File

@ -1,35 +1,26 @@
#pragma once
#include <ATen/core/ivalue.h>
#include <ATen/core/operator_name.h>
#include <torch/csrc/jit/runtime/instruction.h>
#include <vector>
#include <torch/csrc/jit/mobile/code.h>
#include <torch/csrc/jit/mobile/frame.h>
namespace torch {
namespace jit {
namespace mobile {
using Stack = std::vector<c10::IValue>;
using DebugHandle = int64_t;
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct Code {
// TODO: Combine the instructions and debug handles vectors
// into std::vector<std::pair<Instruction, DebugHandle>>
std::vector<Instruction> instructions_;
std::vector<DebugHandle> debug_handles_;
std::vector<c10::OperatorName> op_names_;
std::vector<std::function<void(Stack&)>> operators_;
std::vector<c10::IValue> constants_;
std::vector<c10::TypePtr> types_;
size_t register_size_; // Aggregated output size.
};
struct InterpreterState {
TORCH_API explicit InterpreterState(std::shared_ptr<Code> code);
TORCH_API explicit InterpreterState(const Code& code);
TORCH_API bool run(Stack& stack);
private:
std::shared_ptr<Code> code_;
void enterFrame(const Code&);
void leaveFrame();
void saveExceptionDebugHandle();
c10::IValue& reg(size_t reg);
std::vector<c10::IValue> registers_;
std::vector<Frame> frames_;
};
// Interpreter executes instruction in a loop one by one
@ -39,7 +30,7 @@ struct InterpreterState {
// Note that this is set only when an exception occurs,
// since it is a thread-local variable and setting it for
// every instruction would add the overhead of thread-local variable access.
int64_t getInterpretersExceptionPC();
DebugHandle getInterpretersExceptionDebugHandle();
} // namespace mobile
} // namespace jit
} // namespace torch
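As an aside, the frame-based control flow introduced above (a stack of Frame objects, each carrying its own program counter over a shared Code, where RET pops a frame and execution resumes in the caller) can be illustrated with a minimal, self-contained sketch. The opcode set and Toy* types below are stand-ins for illustration only, not the actual torch::jit::mobile API.
#include <cstdio>
#include <vector>
// Toy opcode set; the real interpreter dispatches on torch::jit::Instruction.
enum class ToyOp { PUSH, ADD, RET };
struct ToyInst {
  ToyOp op;
  int x;
};
struct ToyCode {
  std::vector<ToyInst> instructions;
};
// Mirrors the Frame idea above: a per-call program counter over shared code.
struct ToyFrame {
  explicit ToyFrame(const ToyCode& c) : code(&c) {}
  const ToyCode* code;
  size_t pc = 0;
};
int main() {
  // Computes 2 + 3 and leaves the result on the value stack.
  ToyCode code{{{ToyOp::PUSH, 2}, {ToyOp::PUSH, 3}, {ToyOp::ADD, 0}, {ToyOp::RET, 0}}};
  std::vector<ToyFrame> frames;
  frames.emplace_back(code); // enterFrame
  std::vector<int> stack;
  while (!frames.empty()) {
    ToyFrame& frame = frames.back();
    const ToyInst& inst = frame.code->instructions.at(frame.pc);
    switch (inst.op) {
      case ToyOp::PUSH:
        stack.push_back(inst.x);
        ++frame.pc; // frame.step()
        break;
      case ToyOp::ADD: {
        int rhs = stack.back();
        stack.pop_back();
        stack.back() += rhs;
        ++frame.pc;
        break;
      }
      case ToyOp::RET:
        frames.pop_back(); // leaveFrame; outer frames (if any) keep running
        break;
    }
  }
  std::printf("result = %d\n", stack.back()); // result = 5
  return 0;
}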

View File

@ -78,7 +78,7 @@ void _not(Stack& stack) {
void boolTensor(Stack& stack) {
at::Tensor a;
pop(stack, a);
push(stack, a.is_nonzero());
push(stack, at::native::is_nonzero(a));
}
void toList(Stack& stack) {

View File

@ -17,6 +17,16 @@ namespace jit {
namespace {
std::vector<std::string> splitName(const std::string& name) {
std::vector<std::string> result;
std::string sub_name;
std::istringstream name_stream(name);
while (std::getline(name_stream, sub_name, '.')) {
result.push_back(std::move(sub_name));
}
return result;
}
class AttributePropagator {
public:
AttributePropagator(
@ -27,27 +37,27 @@ class AttributePropagator {
: module_(module),
freezeInterfaces_(freezeInterfaces),
preserveParameters_(preserveParameters) {
// Currently only top level attributes and functions can be preserved
// explicitly.
auto checkName = [this](std::string& name) {
if (module_.hasattr(name)) {
auto attr = module_.attr(name);
const auto resolved_name = resolveName(name);
// Freezing client wants to preserve this submodule. When cleaning
// the frozen module, make sure it will be preserved entirely.
if (attr.isModule()) {
preservedSubModule_.insert(attr.toModule()._ivalue());
if (resolved_name) {
const auto& parent_module = resolved_name->first;
const auto& attr_name = resolved_name->second;
if (parent_module.hasattr(attr_name)) {
auto value = parent_module.attr(attr_name);
// Freezing client wants to preserve this submodule. When cleaning
// the frozen module, make sure it will be preserved entirely.
if (value.isModule()) {
preservedSubModule_.insert(value.toModule()._ivalue());
}
insertMutableAttr(attr_name, value, parent_module._ivalue());
} else {
auto fn = parent_module.get_method(attr_name);
preservedMethods_.insert(&fn.function());
}
insertMutableAttr(name, attr, module_._ivalue());
return true;
}
for (auto& fn : module_.type()->methods()) {
if (fn->name() == name) {
preservedMethods_.insert(fn);
return true;
}
}
return false;
};
@ -119,6 +129,57 @@ class AttributePropagator {
}
private:
using ResolvedName = std::pair<Module, std::string>;
// Try to resolve qualified names (submodule1.submodule2.foo). If
// the qualified name exists in the root module, return the unqualified
// attribute/function name and the parent module. Else, return nullopt.
// Examples:
// submodule1.submodule2.foo -> {submodule2, "foo"}
// submodule1.non_existent_module.foo -> nullopt
c10::optional<ResolvedName> resolveName(const std::string& name) {
auto sub_names = splitName(name);
if (sub_names.empty()) {
return c10::nullopt;
}
auto& attr_name = sub_names.back();
auto cur_module = module_;
std::vector<ResolvedName> attr_infos;
attr_infos.reserve(sub_names.size() - 1);
for (size_t i = 0; i < sub_names.size() - 1; ++i) {
bool found = false;
const auto& sub_name = sub_names[i];
for (const auto& child_module : cur_module.named_children()) {
if (child_module.name == sub_name) {
attr_infos.emplace_back(cur_module._ivalue(), child_module.name);
cur_module = child_module.value;
found = true;
break;
}
}
if (!found) {
return c10::nullopt;
}
}
if (cur_module.hasattr(attr_name) || cur_module.find_method(attr_name)) {
// We don't want to mark these modules as mutable yet; that could
// interfere with the inlining procedure. Instead, we'll record
// the fact that the user wants to preserve them. They will be
// processed during clean-up preparation (recordReferenceAttrs)
for (auto& attr_info : attr_infos) {
const auto& parent_module = attr_info.first;
auto& sub_name = attr_info.second;
userPreservedAttrs_[parent_module._ivalue()].insert(
std::move(sub_name));
}
return std::make_pair(std::move(cur_module), std::move(attr_name));
}
return c10::nullopt;
}
// findConstantAttr function locates the sub Module where attributes are
// defined. The algorithm chases getAttr chains to locate the submodules.
// For example:
@ -638,6 +699,16 @@ class AttributePropagator {
}
}
}
// We have to process the attributes that the user wants to preserve
// separately since it's possible that the user-preserved module is
// never referenced in the graph.
for (const auto& attr_info : userPreservedAttrs_) {
const auto& parent_module = attr_info.first;
for (const auto& attr_name : attr_info.second) {
const auto value = parent_module->getAttr(attr_name);
insertMutableAttr(attr_name, value, parent_module);
}
}
}
// This function recursively iterates over submodules to identify
@ -710,7 +781,7 @@ class AttributePropagator {
}
}
for (auto& fn : type->methods()) {
if (preservedMethods_.count(fn) && *type == *module_.type()) {
if (preservedMethods_.count(fn)) {
continue;
}
funcsToRemove.push_back(fn);
@ -774,6 +845,11 @@ class AttributePropagator {
c10::intrusive_ptr<at::ivalue::Object>>
object_memo_;
// Contains names of attributes that the user wants to preserve with
// their owning modules.
std::unordered_map<ModulePtr, std::unordered_set<std::string>>
userPreservedAttrs_;
}; // class AttributePropagator
void checkModuleDoesNotReturnSelf(const Module& module) {

View File

@ -91,6 +91,7 @@
#include <torch/csrc/jit/runtime/autodiff.h>
#include <torch/csrc/jit/runtime/graph_executor.h>
#include <torch/csrc/jit/runtime/jit_exception.h>
#include <torch/csrc/jit/runtime/jit_trace.h>
#include <torch/csrc/jit/runtime/operator.h>
#include <torch/csrc/jit/runtime/print_handler.h>
#include <torch/csrc/jit/runtime/static/init.h>
@ -520,6 +521,22 @@ void initJITBindings(PyObject* module) {
},
py::doc(
"Interpret a JIT graph with given inputs without running any optimization passes on it"))
.def(
"_jit_trace_graph",
[](std::shared_ptr<Graph>& graph, const py::tuple& inputs) {
Stack stack;
stack.reserve(inputs.size()); // captures?
for (auto& obj : inputs) {
stack.push_back(toTypeInferredIValue(obj));
}
auto g_inputs = graph->inputs();
for (const auto i : c10::irange(inputs.size())) {
if (stack[i].isTensor()) {
g_inputs[i]->setType(stack[i].type());
}
}
return TraceGraph(graph, stack);
})
.def("_jit_pass_remove_expands", RemoveExpands)
.def("_jit_pass_erase_number_types", EraseNumberTypes)
.def("_jit_pass_inline_fork_wait", InlineForkWait)

Some files were not shown because too many files have changed in this diff.